-
Notifications
You must be signed in to change notification settings - Fork 51
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #228 from SysBioChalmers/gecko2devel
chore: merge GECKO2 development into GECKO2
- Loading branch information
Showing
5 changed files
with
458 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
function [uni,EC,MW,Genes,conflicts] = findInDBOld(grRule,DBprot,DBgenes,DBecNum,DBMW) | ||
% findInDBOld | ||
% Gets the uniprots and EC numbers for a given rxn into a given database - this is the old slow implementation, still used for KEGG | ||
% | ||
% grRule Genes association for a given metabolic reaction | ||
% DBprot array of uniprot IDs, taken from swissprot or kegg database | ||
% DBgenes array of genes corresponding to DBprot | ||
% DBecNum array of EC numbers corresponding to DBprot | ||
% DBMW array of molecular weights in Da corresponding to DBprot | ||
% | ||
% uni Array containing all the uniprot IDs related to an isoenzyme, | ||
% for a given reaction in each of its cells. | ||
% EC Array containing all the EC numbers related to an isoenzyme, | ||
% for a given reaction in each of its cells. | ||
% Genes Array containing all the genes related to an isoenzyme, | ||
% for a given reaction in each of its cells. | ||
% conflicts Contains those genes with multiple protein matches. | ||
% | ||
% Usage: [uni,EC,MW,Genes,conflicts] = findInDB(grRule,DBprot,DBgenes,DBecNum,DBMW) | ||
% | ||
% Benjamin J. Sanchez, 2017-08-10 | ||
% Ivan Domenzain, 2018-09-06 | ||
% | ||
|
||
%Find simple gene sets for reaction: | ||
gene_sets = getSimpleGeneSets(grRule); | ||
%Preallocate function variables | ||
uni = cell(size(gene_sets)); | ||
EC = cell(size(gene_sets)); | ||
MW = zeros(size(gene_sets)); | ||
Genes = cell(size(gene_sets)); | ||
conflicts = cell(1,2); | ||
|
||
for i = 1:length(gene_sets) | ||
%Split the gene set and match each gene: | ||
gene_set = strsplit(gene_sets{i},' and '); | ||
uni_set = cell(size(gene_set)); | ||
EC_set = cell(size(gene_set)); | ||
genesCell = cell(size(gene_set)); | ||
for j = 1:length(gene_set) | ||
gene = gene_set{j}; | ||
%Get the indexes for all of the proteins related to gene | ||
matches=find(sum(strcmpi(gene,DBgenes),2)); | ||
if ~isempty(matches) | ||
uni_set{j} = DBprot{matches}; | ||
genesCell{j} = gene; | ||
%Get the indexes of the matched protein(s) with non-empty EC#s | ||
nonEmpty = matches(~cellfun(@isempty,DBecNum(matches))); | ||
if ~isempty(nonEmpty) | ||
[geneECs,ia] = unique(DBecNum(nonEmpty),'stable'); | ||
%For genes with multiple proteins associated to several | ||
%non-empty ecNumbers | ||
if length(geneECs)>1 | ||
indexes = nonEmpty(ia); | ||
ecNum = DBecNum(indexes); | ||
%Save first match | ||
uni_set{j} = DBprot{indexes(1)}; | ||
EC_set{j} = getECstring(EC_set{j},ecNum{1}); | ||
%Save additional matches as potential conflicts | ||
if ~ismember(gene,conflicts{1}) | ||
conflicts{1} = [conflicts{1}; {gene}]; | ||
conflicts{2} = [conflicts{2}; {indexes}]; | ||
end | ||
else | ||
%If there is a single unique ec number for the matched | ||
%protein(s), then choose the lightest protein | ||
[~,minW] = min(cell2mat(DBMW(nonEmpty))); | ||
matches = nonEmpty(minW); | ||
uni_set{j} = DBprot{matches}; | ||
EC_set{j} = getECstring(EC_set{j},DBecNum{matches}); | ||
end | ||
end | ||
else | ||
uni_set{j} = ''; | ||
end | ||
|
||
if isempty(EC_set{j}) | ||
EC_set{j} = ''; | ||
end | ||
end | ||
%Uniprot: Delete repeated and empty spaces | ||
[uni_set,repeated] = deleteRepeated(uni_set); | ||
genesCell = genesCell(~repeated); | ||
emptyIndexes = cellfun(@isempty,uni_set); | ||
uni_set = uni_set(~emptyIndexes); | ||
genesCell = genesCell(~emptyIndexes); | ||
|
||
%EC: Find union and intersection between all units (only applies for | ||
%complexes, i.e. length(EC_set) > 1): | ||
uni_EC = strsplit(EC_set{1},' '); | ||
int_EC = uni_EC; | ||
for j = 2:length(EC_set) | ||
other_EC = strsplit(EC_set{j},' '); | ||
if isempty(uni_EC) | ||
uni_EC = other_EC; | ||
int_EC = other_EC; | ||
elseif ~isempty(other_EC) | ||
uni_EC = compare_wild([uni_EC other_EC]); | ||
int_EC = intersection(int_EC,other_EC); | ||
end | ||
end | ||
%Use the intersection found, if any. If not, use the union: | ||
if isempty(int_EC) | ||
EC_set = uni_EC; | ||
else | ||
EC_set = int_EC; | ||
end | ||
%MW: use uniprot codes + swissprot table: | ||
for j = 1:length(uni_set) | ||
index = strcmpi(DBprot(:),uni_set{j}); | ||
minWeight = min(DBMW{index}); | ||
MW(i) = MW(i) + minWeight; | ||
end | ||
%Add new codes as new possible isoenzymes: | ||
uni{i} = strjoin(uni_set,' '); | ||
EC{i} = strjoin(EC_set,' '); | ||
Genes{i} = strjoin(genesCell,' '); | ||
end | ||
|
||
%Delete repeated Uniprots (and the corresponding ECs): | ||
[uni,deleted] = deleteRepeated(uni); | ||
EC(deleted) = []; | ||
MW(deleted) = []; | ||
|
||
end | ||
|
||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
function int_EC = intersection(prev_EC,new_EC) | ||
%Finds the common elements between two cell arrays, if any. Also considers | ||
%wildcards (e.g. if 'EC1.1.1.1' is in one array and 'EC1.1.1.-' is in the | ||
%other one, then 'EC1.1.1.1' is added to the intersection). | ||
int_EC = {}; | ||
for i = 1:length(prev_EC) | ||
for j = 1:length(new_EC) | ||
new_int = compare_wild({prev_EC{i} new_EC{j}}); | ||
if length(new_int) == 1 | ||
int_EC = [int_EC new_int]; | ||
end | ||
end | ||
end | ||
end | ||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||
function int_EC = compare_wild(EC) | ||
%Goes through a cell array of EC numbers, and erases any repetitions, | ||
%considering also wildcards (e.g. will erase 'EC1.2.3.-' if 'EC1.2.3.4' is | ||
%already present). | ||
|
||
%Trim all EC numbers of wild cards (e.g. 'EC1.2.-.-' -> 'EC1.2.'): | ||
EC_trimmed = EC; | ||
for i = 1:length(EC) | ||
%Add a final dot for avoiding issues (e.g. 'EC1.1.1.1' & 'EC1.1.1.12'): | ||
ECi = [EC{i} '.']; | ||
pos = strfind(ECi,'-'); | ||
if ~isempty(pos) | ||
ECi = ECi(1:pos(1)-1); | ||
end | ||
EC_trimmed{i} = ECi; | ||
end | ||
|
||
%Compare all EC numbers between them to find if 1 fits in the other: | ||
non_repeated = true(1,length(EC)); | ||
for i = 1:length(EC)-1 | ||
for j = i+1:length(EC) | ||
ECi = EC_trimmed{i}; | ||
ECj = EC_trimmed{j}; | ||
%If ECj fits in ECi then ECj can be disregarded: | ||
if strfind(ECi,ECj) | ||
non_repeated(j) = false; | ||
%Else, if ECi fits in ECj then ECi can be disregarded: | ||
elseif strfind(ECj,ECi) | ||
non_repeated(i) = false; | ||
end | ||
end | ||
end | ||
|
||
int_EC = EC(non_repeated); | ||
|
||
end | ||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
Oops, something went wrong.