diff --git a/geckomat/get_enzyme_data/findInDB.m b/geckomat/get_enzyme_data/findInDB.m index 06b702d54..cffdef468 100644 --- a/geckomat/get_enzyme_data/findInDB.m +++ b/geckomat/get_enzyme_data/findInDB.m @@ -1,4 +1,4 @@ -function [uni,EC,MW,Genes,conflicts] = findInDB(grRule,DBprot,DBgenes,DBecNum,DBMW) +function [uni,EC,MW,Genes,conflicts] = findInDB(grRule,DBprot,DBgenes,DBecNum,DBMW, geneIndex, geneHashMap) % findInDB % Gets the uniprots and EC numbers for a given rxn into a given database % @@ -20,6 +20,7 @@ % % Benjamin J. Sanchez, 2017-08-10 % Ivan Domenzain, 2018-09-06 +% Johan Gustafsson, 2021-07-02 introduced optimizations from GeckoLight % %Find simple gene sets for reaction: @@ -33,19 +34,40 @@ for i = 1:length(gene_sets) %Split the gene set and match each gene: - gene_set = strsplit(gene_sets{i},' and '); + gene_set = strsplit(gene_sets{i},' and '); %this string splitting together with getSimpleGeneSets takes 2.6 s, that is ok. uni_set = cell(size(gene_set)); + uni_set_idx = cell(size(gene_set)); EC_set = cell(size(gene_set)); genesCell = cell(size(gene_set)); + + %find index indices of all genes in one call + + %this loop is slow for j = 1:length(gene_set) - gene = gene_set{j}; - %Get the indexes for all of the proteins related to gene - matches=find(sum(strcmpi(gene,DBgenes),2)); - if ~isempty(matches) + gene = gene_set{j}; + matches = []; + %Get the indexes for all of the proteins related to gene + if isKey(geneHashMap,gene) %annoyingly, this seems to be needed + matchInd = cell2mat(values(geneHashMap,gene_set(j))); + matches = geneIndex{matchInd}; + %matches2=find(sum(strcmpi(gene,DBgenes),2)); + %if (length(matches) ~= length(matches2)) + % disp('misfit length') + %end + %for i = 1:length(matches) + % if (~all(matches == matches2)) + % disp('misfit numbers') + % end + %end + end +% end %remove later + %matches=find(sum(strcmpi(gene,DBgenes),2)); + if ~isempty(matches) uni_set{j} = DBprot{matches}; + uni_set_idx{j} = matches; genesCell{j} = gene; %Get the indexes of the matched protein(s) with non-empty EC#s - nonEmpty = matches(~cellfun(@isempty,DBecNum(matches))); + nonEmpty = matches(~cellfun(@isempty,DBecNum(matches))); if ~isempty(nonEmpty) [geneECs,ia] = unique(DBecNum(nonEmpty),'stable'); %For genes with multiple proteins associated to several @@ -55,6 +77,7 @@ ecNum = DBecNum(indexes); %Save first match uni_set{j} = DBprot{indexes(1)}; + uni_set_idx{j} = indexes(1); EC_set{j} = getECstring(EC_set{j},ecNum{1}); %Save additional matches as potential conflicts if ~ismember(gene,conflicts{1}) @@ -67,24 +90,28 @@ [~,minW] = min(cell2mat(DBMW(nonEmpty))); matches = nonEmpty(minW); uni_set{j} = DBprot{matches}; + uni_set_idx{j} = matches; EC_set{j} = getECstring(EC_set{j},DBecNum{matches}); end end else uni_set{j} = ''; + uni_set_idx{j} = NaN; end if isempty(EC_set{j}) EC_set{j} = ''; end - end + end %Uniprot: Delete repeated and empty spaces [uni_set,repeated] = deleteRepeated(uni_set); + uni_set_idx = uni_set_idx(~repeated); genesCell = genesCell(~repeated); emptyIndexes = cellfun(@isempty,uni_set); uni_set = uni_set(~emptyIndexes); + uni_set_idx = uni_set_idx(~emptyIndexes); genesCell = genesCell(~emptyIndexes); - + %EC: Find union and intersection between all units (only applies for %complexes, i.e. length(EC_set) > 1): uni_EC = strsplit(EC_set{1},' '); @@ -107,7 +134,13 @@ end %MW: use uniprot codes + swissprot table: for j = 1:length(uni_set) - index = strcmpi(DBprot(:),uni_set{j}); + %index2 = strcmpi(DBprot(:),uni_set{j}); + index = uni_set_idx{j}; + %if (length(find(index2)) ~= length(index)) + % disp('Length error') + %elseif (~all(index == find(index2))) + % disp('value error') + %end minWeight = min(DBMW{index}); MW(i) = MW(i) + minWeight; end @@ -115,7 +148,7 @@ uni{i} = strjoin(uni_set,' '); EC{i} = strjoin(EC_set,' '); Genes{i} = strjoin(genesCell,' '); -end + end %Delete repeated Uniprots (and the corresponding ECs): [uni,deleted] = deleteRepeated(uni); diff --git a/geckomat/get_enzyme_data/findInDBOld.m b/geckomat/get_enzyme_data/findInDBOld.m new file mode 100644 index 000000000..d37507a2d --- /dev/null +++ b/geckomat/get_enzyme_data/findInDBOld.m @@ -0,0 +1,179 @@ +function [uni,EC,MW,Genes,conflicts] = findInDBOld(grRule,DBprot,DBgenes,DBecNum,DBMW) +% findInDBOld +% Gets the uniprots and EC numbers for a given rxn into a given database - this is the old slow implementation, still used for KEGG +% +% grRule Genes association for a given metabolic reaction +% DBprot array of uniprot IDs, taken from swissprot or kegg database +% DBgenes array of genes corresponding to DBprot +% DBecNum array of EC numbers corresponding to DBprot +% DBMW array of molecular weights in Da corresponding to DBprot +% +% uni Array containing all the uniprot IDs related to an isoenzyme, +% for a given reaction in each of its cells. +% EC Array containing all the EC numbers related to an isoenzyme, +% for a given reaction in each of its cells. +% Genes Array containing all the genes related to an isoenzyme, +% for a given reaction in each of its cells. +% conflicts Contains those genes with multiple protein matches. +% +% Usage: [uni,EC,MW,Genes,conflicts] = findInDB(grRule,DBprot,DBgenes,DBecNum,DBMW) +% +% Benjamin J. Sanchez, 2017-08-10 +% Ivan Domenzain, 2018-09-06 +% + +%Find simple gene sets for reaction: +gene_sets = getSimpleGeneSets(grRule); +%Preallocate function variables +uni = cell(size(gene_sets)); +EC = cell(size(gene_sets)); +MW = zeros(size(gene_sets)); +Genes = cell(size(gene_sets)); +conflicts = cell(1,2); + +for i = 1:length(gene_sets) + %Split the gene set and match each gene: + gene_set = strsplit(gene_sets{i},' and '); + uni_set = cell(size(gene_set)); + EC_set = cell(size(gene_set)); + genesCell = cell(size(gene_set)); + for j = 1:length(gene_set) + gene = gene_set{j}; + %Get the indexes for all of the proteins related to gene + matches=find(sum(strcmpi(gene,DBgenes),2)); + if ~isempty(matches) + uni_set{j} = DBprot{matches}; + genesCell{j} = gene; + %Get the indexes of the matched protein(s) with non-empty EC#s + nonEmpty = matches(~cellfun(@isempty,DBecNum(matches))); + if ~isempty(nonEmpty) + [geneECs,ia] = unique(DBecNum(nonEmpty),'stable'); + %For genes with multiple proteins associated to several + %non-empty ecNumbers + if length(geneECs)>1 + indexes = nonEmpty(ia); + ecNum = DBecNum(indexes); + %Save first match + uni_set{j} = DBprot{indexes(1)}; + EC_set{j} = getECstring(EC_set{j},ecNum{1}); + %Save additional matches as potential conflicts + if ~ismember(gene,conflicts{1}) + conflicts{1} = [conflicts{1}; {gene}]; + conflicts{2} = [conflicts{2}; {indexes}]; + end + else + %If there is a single unique ec number for the matched + %protein(s), then choose the lightest protein + [~,minW] = min(cell2mat(DBMW(nonEmpty))); + matches = nonEmpty(minW); + uni_set{j} = DBprot{matches}; + EC_set{j} = getECstring(EC_set{j},DBecNum{matches}); + end + end + else + uni_set{j} = ''; + end + + if isempty(EC_set{j}) + EC_set{j} = ''; + end + end + %Uniprot: Delete repeated and empty spaces + [uni_set,repeated] = deleteRepeated(uni_set); + genesCell = genesCell(~repeated); + emptyIndexes = cellfun(@isempty,uni_set); + uni_set = uni_set(~emptyIndexes); + genesCell = genesCell(~emptyIndexes); + + %EC: Find union and intersection between all units (only applies for + %complexes, i.e. length(EC_set) > 1): + uni_EC = strsplit(EC_set{1},' '); + int_EC = uni_EC; + for j = 2:length(EC_set) + other_EC = strsplit(EC_set{j},' '); + if isempty(uni_EC) + uni_EC = other_EC; + int_EC = other_EC; + elseif ~isempty(other_EC) + uni_EC = compare_wild([uni_EC other_EC]); + int_EC = intersection(int_EC,other_EC); + end + end + %Use the intersection found, if any. If not, use the union: + if isempty(int_EC) + EC_set = uni_EC; + else + EC_set = int_EC; + end + %MW: use uniprot codes + swissprot table: + for j = 1:length(uni_set) + index = strcmpi(DBprot(:),uni_set{j}); + minWeight = min(DBMW{index}); + MW(i) = MW(i) + minWeight; + end + %Add new codes as new possible isoenzymes: + uni{i} = strjoin(uni_set,' '); + EC{i} = strjoin(EC_set,' '); + Genes{i} = strjoin(genesCell,' '); +end + +%Delete repeated Uniprots (and the corresponding ECs): +[uni,deleted] = deleteRepeated(uni); +EC(deleted) = []; +MW(deleted) = []; + +end + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function int_EC = intersection(prev_EC,new_EC) +%Finds the common elements between two cell arrays, if any. Also considers +%wildcards (e.g. if 'EC1.1.1.1' is in one array and 'EC1.1.1.-' is in the +%other one, then 'EC1.1.1.1' is added to the intersection). +int_EC = {}; +for i = 1:length(prev_EC) + for j = 1:length(new_EC) + new_int = compare_wild({prev_EC{i} new_EC{j}}); + if length(new_int) == 1 + int_EC = [int_EC new_int]; + end + end +end +end +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function int_EC = compare_wild(EC) +%Goes through a cell array of EC numbers, and erases any repetitions, +%considering also wildcards (e.g. will erase 'EC1.2.3.-' if 'EC1.2.3.4' is +%already present). + +%Trim all EC numbers of wild cards (e.g. 'EC1.2.-.-' -> 'EC1.2.'): +EC_trimmed = EC; +for i = 1:length(EC) + %Add a final dot for avoiding issues (e.g. 'EC1.1.1.1' & 'EC1.1.1.12'): + ECi = [EC{i} '.']; + pos = strfind(ECi,'-'); + if ~isempty(pos) + ECi = ECi(1:pos(1)-1); + end + EC_trimmed{i} = ECi; +end + +%Compare all EC numbers between them to find if 1 fits in the other: +non_repeated = true(1,length(EC)); +for i = 1:length(EC)-1 + for j = i+1:length(EC) + ECi = EC_trimmed{i}; + ECj = EC_trimmed{j}; + %If ECj fits in ECi then ECj can be disregarded: + if strfind(ECi,ECj) + non_repeated(j) = false; + %Else, if ECi fits in ECj then ECi can be disregarded: + elseif strfind(ECj,ECi) + non_repeated(i) = false; + end + end +end + +int_EC = EC(non_repeated); + +end +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \ No newline at end of file diff --git a/geckomat/get_enzyme_data/getEnzymeCodes.m b/geckomat/get_enzyme_data/getEnzymeCodes.m index fa3994bcd..4d71790d8 100644 --- a/geckomat/get_enzyme_data/getEnzymeCodes.m +++ b/geckomat/get_enzyme_data/getEnzymeCodes.m @@ -26,10 +26,11 @@ % % Benjamin Sanchez. Last edited: 2017-03-05 % Ivan Domenzain. Last edited: 2018-09-07 +% Johan Gustafsson Last edited: 2021-07-02 Introduced optimizations from Gecko Light %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -function model_data = getEnzymeCodes(model,action) +function model_data = getEnzymeCodes(model, action) -if nargin<2 +if nargin<3 action = 'display'; end @@ -58,7 +59,9 @@ [m,n] = size(model.S); substrates = cell(n,20); +substrateIndices = zeros(n,20); products = cell(n,20); +productIndices = zeros(n,20); uniprots = cell(n,20); Genes = cell(n,20); EC_numbers = cell(n,20); @@ -67,6 +70,26 @@ count = zeros(4,1); rgmat = full(model.rxnGeneMat); conflicts = cell(1,4); + +%Build an index from gene to prot for faster processing later +[x,y] = size(DBgenesSwissprot); +genesForIndex = reshape(DBgenesSwissprot, x*y, 1); +genesForIndex = genesForIndex(~cellfun(@isempty, genesForIndex)); +genesForIndex = unique(genesForIndex); %18360 +geneIndex = cell(length(genesForIndex),1); +geneHashMap = containers.Map(genesForIndex,1:length(genesForIndex)); +protIndices = 1:length(DBgenesSwissprot(:,1)); +for i = 1:y + tmp1 = DBgenesSwissprot(:,i); + sel = ~cellfun(@isempty, tmp1); + indices = cell2mat(values(geneHashMap,tmp1(sel))); + protIndicesSel = protIndices(sel); + for j = 1:length(indices) + geneIndex{indices(j)} = [geneIndex{indices(j)};protIndicesSel(j)]; + end +end + + for i = 1:n ks = 1; kp = 1; @@ -74,23 +97,32 @@ inv = 0; DB = ''; %Save the substrates and products (if rxn is reversible): - for j = 1:m - if model.S(j,i) < 0 && model.ub(i) > 0 - substrates{i,ks} = model.metNames{j}; - ks = ks+1; - dir = 1; - elseif model.S(j,i) > 0 && model.lb(i) < 0 - products{i,kp} = model.metNames{j}; - kp = kp+1; - inv = 1; - end + %This code was opimized (a loop through every element was replaced) + %It was tested and produces the same results, but roughly 2000 times faster + %This part executes in roughly 0.3 seconds for all genes, so it is no + %longer an issue regarding performance + if model.ub(i) > 0 + sel = model.S(:,i) < 0; + sbstrs = model.metNames(sel); + substrates(i,1:length(sbstrs)) = sbstrs; + ind = find(sel); + substrateIndices(i,1:length(ind)) = ind; + dir = length(sbstrs) > 0; + end + if model.lb(i) < 0 + sel = model.S(:,i) > 0; + prds = model.metNames(sel); + products(i,1:length(prds)) = prds; + ind = find(sel); + productIndices(i,1:length(ind)) = ind; + inv = length(prds) > 0; end %isrev(i) = 0 if rxn is blocked, = 1 if non-reversible, and = 2 if %reversible: isrev(i) = dir + inv; if ~isempty(model.grRules{i}) %Find match in Swissprot: - [new_uni,new_EC,new_MW,newGene,multGenes] = findInDB(model.grRules{i},DBprotSwissprot,DBgenesSwissprot,DBecNumSwissprot,DBMWSwissprot); + [new_uni,new_EC,new_MW,newGene,multGenes] = findInDB(model.grRules{i},DBprotSwissprot,DBgenesSwissprot,DBecNumSwissprot,DBMWSwissprot,geneIndex,geneHashMap); if ~isempty(union_string(new_EC)) count(1) = count(1) + isrev(i); DBase = 'swissprot'; @@ -98,8 +130,8 @@ multGenes{3} = DBase; end else - %Find match in KEGG: - [new_uni,new_EC,new_MW,newGene,multGenes] = findInDB(model.grRules{i},DBprotKEGG,DBgenesKEGG,DBecNumKEGG,DBMWKEGG); + %Find match in KEGG (skipped optimizing this step) - may be good to fix this later to get rid of the findInDBOld function: + [new_uni,new_EC,new_MW,newGene,multGenes] = findInDBOld(model.grRules{i},DBprotKEGG,DBgenesKEGG,DBecNumKEGG,DBMWKEGG); if ~isempty(union_string(new_EC)) count(2) = count(2) + isrev(i); DBase = 'kegg'; @@ -157,7 +189,9 @@ end model_data.model = model; model_data.substrates = substrates; +model_data.substrateIndices = substrateIndices; %for optimizing matchKcats model_data.products = products; +model_data.productIndices = productIndices; %for optimizing matchKcats model_data.matchedGenes = Genes; model_data.uniprots = uniprots; model_data.EC_numbers = EC_numbers; diff --git a/geckomat/get_enzyme_data/loadBRENDAdata.m b/geckomat/get_enzyme_data/loadBRENDAdata.m index 2bb50024c..84537a976 100644 --- a/geckomat/get_enzyme_data/loadBRENDAdata.m +++ b/geckomat/get_enzyme_data/loadBRENDAdata.m @@ -15,11 +15,29 @@ SAcell{i} = []; end previousEC = []; EC_indexes = []; + + %build an index on MW{1} to speed things up a bit + %first just extract the genus (i.e. the first part of the name) + MWECNum = upper(unique(MW{1})); + MWECNumIndices = cell(length(MWECNum),1); + MWECNumHashMap = containers.Map(MWECNum,1:length(MWECNum)); + for i = 1:length(MW{1}) + matchInd = cell2mat(values(MWECNumHashMap, MW{1}(i))); + MWECNumIndices{matchInd} = [MWECNumIndices{matchInd};i]; + end + + for i=1:length(SA{1}) %Gets the indexes of the EC repetitions in the MW cell for every %new (different) EC if ~strcmpi(SA{1}(i), previousEC) - EC_indexes = find(strcmpi(SA{1}(i),MW{1})); + key = upper(SA{1}(i)); + if isKey(MWECNumHashMap,key) %annoyingly, this seems to be needed + matchInd = cell2mat(values(MWECNumHashMap,key)); + EC_indexes = MWECNumIndices{matchInd}; + else + EC_indexes = []; + end end mwEC{1} = MW{3}(EC_indexes); mwEC{2} = MW{4}(EC_indexes); % just looks for the first match because just the maximal value for diff --git a/geckomat/get_enzyme_data/matchKcats.m b/geckomat/get_enzyme_data/matchKcats.m index ab0da16c9..93901ac7f 100644 --- a/geckomat/get_enzyme_data/matchKcats.m +++ b/geckomat/get_enzyme_data/matchKcats.m @@ -1,9 +1,11 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - % kcats = matchKcats(model_data,org_name) + % kcats = matchKcats(model_data,org_name, minAcceptableKCat) % Matchs the model EC numbers and substrates to the BRENDA database, to % return the corresponding kcats for each reaction. % % INPUT: Model data structure (generated by getECnumbers.m) +% minAcceptableKCat [optional] Set this value if you want to +% replace too low kcats with a minimum value (unit: s^-1) % OUTPUTS: kcats, which contains: % *forw.kcats: kcat values for the forward reactions (mxn) % *forw.org_s: Number of matches for organism - substrate in @@ -47,8 +49,15 @@ % % Benjamin J. Sanchez. Last edited: 2016-03-01 % Ivan Domenzain. Last edited: 2018-01-16 + % Johan Gustafsson Last edited: 2021-07-02 Introduced optimizations from GeckoLight %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - function kcats = matchKcats(model_data, org_name) + function kcats = matchKcats(model_data, org_name, minAcceptableKCat) + + if nargin < 3 + minAcceptableKCat = 0; + end + + minAcceptableKCat = minAcceptableKCat * 3600;%convert to per hour fprintf('Matching kcats...') @@ -60,9 +69,29 @@ phylDistStruct = KEGG_struct; %Get the KEGG code for the model's organism org_index = find_inKEGG(org_name,phylDistStruct.names); + %build an index for genus in the phyl dist struct + %first just extract the genus (i.e. the first part of the name) + phylDistStruct.genus = cell(length(phylDistStruct.names),1); + for i = 1:length(phylDistStruct.genus) + name = phylDistStruct.names{i}; + phylDistStruct.genus{i} = lower(name(1:(strfind(name,' ')-1))); %convert all to lower case to avoid problems with case + end + %create a map for the genuses + phylDistStruct.uniqueGenusList = unique(phylDistStruct.genus); + phylDistStruct.genusHashMap = containers.Map(phylDistStruct.uniqueGenusList,1:length(phylDistStruct.uniqueGenusList)); + phylDistStruct.uniqueGenusIndices = cell(length(phylDistStruct.uniqueGenusList),1); + + %Then for each genus create a list with indices to the names + for i = 1:length(phylDistStruct.genus) + matchInd = cell2mat(values(phylDistStruct.genusHashMap,phylDistStruct.genus(i))); + phylDistStruct.uniqueGenusIndices{matchInd} = [phylDistStruct.uniqueGenusIndices{matchInd};i]; + end + %Extract relevant info from model_data: substrates = model_data.substrates; + substrateIndices = model_data.substrateIndices; products = model_data.products; + productIndices = model_data.productIndices; EC_numbers = model_data.EC_numbers; MWs = model_data.MWs; model = model_data.model; @@ -75,6 +104,7 @@ forw.rest_ns = zeros(mM,nM); forw.org_sa = zeros(mM,nM); forw.rest_sa = zeros(mM,nM); + forw.wcLevel = NaN(mM,nM); back.kcats = zeros(mM,nM); back.org_s = zeros(mM,nM); back.rest_s = zeros(mM,nM); @@ -82,6 +112,7 @@ back.rest_ns = zeros(mM,nM); back.org_sa = zeros(mM,nM); back.rest_sa = zeros(mM,nM); + back.wcLevel = NaN(mM,nM); tot.queries = 0; tot.org_s = 0; tot.rest_s = 0; @@ -96,23 +127,39 @@ tot.wc4 = 0; tot.matrix = zeros(6,5); + %build an EC index to speed things up a bit - many of the ECs appear + %many times - unnecessary to compare them all + %so, here, each EC string appears only once, and you get a vector with + %indices to the rows in KCATcell + [ECIndexIds,~,ic] = unique(KCATcell{1}); + EcIndexIndices = cell(length(ECIndexIds),1); + for i = 1:length(EcIndexIndices) + EcIndexIndices{i} = find(ic == i).'; + end + + + %Main loop: for i = 1:mM %Match: for j = 1:nM EC = EC_numbers{i,j}; MW = MWs(i,j); + if (isempty(EC)) + break; + end + EC = strsplit(EC,' '); %Try to match direct reaction: - if ~isempty(EC) && ~isempty(substrates{i,1}) + if ~isempty(substrates{i,1}) [forw,tot] = iterativeMatch(EC,MW,substrates(i,:),i,j,KCATcell,... forw,tot,model,org_name,... - phylDistStruct,org_index,SAcell); + phylDistStruct,org_index,SAcell,ECIndexIds,EcIndexIndices, substrateIndices(i,:), minAcceptableKCat); end %Repeat for inverse reaction: - if ~isempty(EC) && ~isempty(products{i,1}) + if ~isempty(products{i,1}) [back,tot] = iterativeMatch(EC,MW,products(i,:),i,j,KCATcell,... back,tot,model,org_name,... - phylDistStruct,org_index,SAcell); + phylDistStruct,org_index,SAcell,ECIndexIds,EcIndexIndices, productIndices(i,:), minAcceptableKCat); end end %Display progress: @@ -130,11 +177,10 @@ end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function [dir,tot] =iterativeMatch(EC,MW,subs,i,j,KCATcell,dir,tot,model,... - name,phylDist,org_index,SAcell) + name,phylDist,org_index,SAcell,ECIndexIds,EcIndexIndices, subsIndices, minAcceptableKCat) %Will iteratively try to match the EC number to some registry in BRENDA, %using each time one additional wildcard. - EC = strsplit(EC,' '); kcat = zeros(size(EC)); origin = zeros(size(EC)); matches = zeros(size(EC)); @@ -145,7 +191,7 @@ %Atempt match: [kcat(k),origin(k),matches(k)] = mainMatch(EC{k},MW,subs,KCATcell,... model,i,name,phylDist,... - org_index,SAcell); + org_index,SAcell,ECIndexIds,EcIndexIndices,subsIndices, minAcceptableKCat); %If any match found, ends. If not, introduces one extra wild card and %tries again: if origin(k) > 0 @@ -180,6 +226,7 @@ dir.org_sa(i,j) = matches*(origin == 4); dir.rest_ns(i,j) = matches*(origin == 5); dir.rest_sa(i,j) = matches*(origin == 6); + dir.wcLevel(i,j) = wc_num; tot.org_s = tot.org_s + (origin == 1); tot.rest_s = tot.rest_s + (origin == 2); tot.org_ns = tot.org_ns + (origin == 3); @@ -199,48 +246,62 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function [kcat,origin,matches] = mainMatch(EC,MW,subs,KCATcell,model,i,... - name,phylDist,org_index,SAcell) + name,phylDist,org_index,SAcell,ECIndexIds,EcIndexIndices,subsIndices, minAcceptableKCat) + %First make the string matching. This takes time, so we only want to do + %this once: + %Relaxes matching if wild cards are present: + wild = false; + wild_pos = strfind(EC,'-'); + if ~isempty(wild_pos) + EC = EC(1:wild_pos(1)-1); + wild = true; + end + stringMatchesEC_cell = extract_string_matches(EC,KCATcell{1},wild,ECIndexIds,EcIndexIndices); + % Matching function prioritizing organism and substrate specificity when % available. origin = 0; %First try to match organism and substrate: [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,name,true,false,model,i,... - phylDist,org_index,SAcell); + phylDist,org_index,SAcell,stringMatchesEC_cell,[],subsIndices, minAcceptableKCat); if matches > 0 origin = 1; %If no match, try the closest organism but match the substrate: else [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,'',true,false,model,i,... - phylDist,org_index,SAcell); + phylDist,org_index,SAcell,stringMatchesEC_cell,[],subsIndices, minAcceptableKCat); if matches > 0 origin = 2; %If no match, try to match organism but with any substrate: else [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,name,false,false,... - model,i,phylDist,org_index,SAcell); + model,i,phylDist,org_index,SAcell,stringMatchesEC_cell,[],subsIndices, minAcceptableKCat); if matches > 0 origin = 3; %If no match, try to match organism but for any substrate (SA*MW): else + %create matching index for SA, has not been needed until now + stringMatchesSA = extract_string_matches(EC,SAcell{1},wild,[],[]); + [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,name,false,... true,model,i,phylDist,org_index,... - SAcell); + SAcell,stringMatchesEC_cell,stringMatchesSA,subsIndices, minAcceptableKCat); if matches > 0 origin = 4; %If no match, try any organism and any substrate: else [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,'',false,... false,model,i,phylDist,... - org_index,SAcell); + org_index,SAcell,stringMatchesEC_cell,stringMatchesSA,subsIndices, minAcceptableKCat); if matches > 0 origin = 5; %Again if no match, look for any org and SA*MW else [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,'',... false,true,model,i,phylDist,... - org_index, SAcell); + org_index,SAcell,stringMatchesEC_cell,stringMatchesSA,subsIndices, minAcceptableKCat); if matches > 0 origin = 6; end @@ -254,45 +315,61 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function [kcat,matches] = matchKcat(EC,MW,subs,KCATcell,organism,... substrate,SA,model,i,phylDist,... - org_index,SAcell) + org_index,SAcell,KCATcellMatches,SAcellMatches,subsIndices,minAcceptableKCat) %Will go through BRENDA and will record any match. Afterwards, it will %return the average value and the number of matches attained. kcat = []; matches = 0; - %Relaxes matching if wild cards are present: - wild = false; - wild_pos = strfind(EC,'-'); - if ~isempty(wild_pos) - EC = EC(1:wild_pos(1)-1); - wild = true; - end - + if SA - EC_indexes = extract_indexes(EC,SAcell{1},[],SAcell{2},subs,substrate,... - organism,org_index,phylDist,wild); + %SAcell{1},wild,[],[] + EC_indexes = extract_indexes(SAcellMatches,[],SAcell{2},subs,substrate,... + organism,org_index,phylDist); + kcat = SAcell{3}(EC_indexes); org_cell = SAcell{2}(EC_indexes); MW_BRENDA = SAcell{4}(EC_indexes); + + %to handle bad kcat values that totally dominate the modeling, we do + %not accept a lower kcat than 1 s^-1, i.e. 3600 h^-1 + %need to handle this in several places, since it is sometimes modified + %for stoichiometry + kcat(kcat < minAcceptableKCat) = minAcceptableKCat; else - EC_indexes = extract_indexes(EC,KCATcell{1},KCATcell{2},KCATcell{3},... + %KCATcell{1},wild,ECIndexIds,EcIndexIndices + EC_indexes = extract_indexes(KCATcellMatches,KCATcell{2},KCATcell{3},... subs,substrate,organism,org_index,... - phylDist,wild); + phylDist); if substrate for j = 1:length(EC_indexes) indx = EC_indexes(j); for k = 1:length(subs) - l = logical(strcmpi(model.metNames,subs{k}).*(model.S(:,i)~=0)); + if (isempty(subs{k})) + break; + end + %l = logical(strcmpi(model.metNames,subs{k}).*(model.S(:,i)~=0)); %I don't understand the .* (model.S(:,i)~=0) part, it shouldn't be needed?/JG; + l = subsIndices(k); if ~isempty(subs{k}) && strcmpi(subs{k},KCATcell{2}(indx)) if KCATcell{4}(indx) > 0 coeff = min(abs(model.S(l,i))); - kcat = [kcat;KCATcell{4}(indx)/coeff]; + kCatTmp = KCATcell{4}(indx); + %to handle bad kcat values that totally dominate the modeling, we do + %not accept a lower kcat than 1 s^-1, i.e. 3600 h^-1 + %need to handle this in several places, since it is sometimes modified + %for stoichiometry + if kCatTmp < minAcceptableKCat + kCatTmp = minAcceptableKCat; + end + + kcat = [kcat;kCatTmp/coeff]; end end end end else kcat = KCATcell{4}(EC_indexes); + kcat(kcat < minAcceptableKCat) = minAcceptableKCat; end end %Return maximum value: @@ -314,34 +391,64 @@ kcat = 1E7*3600; end end + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + %Make the string matches of the ECs. This is heavy, so only do it once! + % + function EC_indexes = extract_string_matches(EC,EC_cell,wild,ECIndexIds,EcIndexIndices) + EC_indexes = []; + EC_indexesOld = []; + if wild + if (~isempty(ECIndexIds)) %In some cases the EC_cell is not from KCatCell + X = find(contains(ECIndexIds, EC)); + for j = 1:length(X) + EC_indexes = [EC_indexes,EcIndexIndices{X(j)}]; + end + else %Not optimized + for j=1:length(EC_cell) + if strfind(EC_cell{j},EC)==1 + EC_indexes = [EC_indexes,j]; + end + end + end + else + if (~isempty(ECIndexIds)) %In some cases the EC_cell is not from KCatCell + mtch = find(strcmpi(EC,ECIndexIds)); + if (~isempty(mtch)) + EC_indexes = EcIndexIndices{mtch}; + end + else %%Not optimized + if ~isempty(EC_cell) + EC_indexes = transpose(find(strcmpi(EC,EC_cell))); + end + end + end + + end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %Extract the indexes of the entries in the BRENDA data that meet the %conditions specified by the search criteria - function EC_indexes = extract_indexes(EC,EC_cell,subs_cell,orgs_cell,subs,... + function EC_indexes = extract_indexes(EC_indCellStringMatches,subs_cell,orgs_cell,subs,... substrate,organism, org_index,... - phylDist,wild) + phylDist) + + EC_indexes = EC_indCellStringMatches;%reuse so the string comparisons are not run many times - EC_indexes = []; - if wild - for j=1:length(EC_cell) - if strfind(EC_cell{j},EC)==1 - EC_indexes = [EC_indexes,j]; - end - end - else - EC_indexes = transpose(find(strcmpi(EC,EC_cell))); - end %If substrate=true then it will extract only the substrates appereances %indexes in the EC subset from the BRENDA cell array + if substrate - Subs_indexes = []; - for l = 1:length(subs) - if ~isempty(subs(l)) + if (~isempty(EC_indexes)) %optimization + Subs_indexes = []; + for l = 1:length(subs) + if (isempty(subs{l})) + break; + end Subs_indexes = horzcat(Subs_indexes,EC_indexes(strcmpi(subs(l),... subs_cell(EC_indexes)))); end + EC_indexes = Subs_indexes; end - EC_indexes = Subs_indexes; end EC_orgs = orgs_cell(EC_indexes); @@ -359,7 +466,6 @@ %on the BRENDA cell array it should search for a KEGG code for each of %these for j=1:length(EC_indexes) - %Assigns a KEGG index for those found on the KEGG struct orgs_index = find(strcmpi(orgs_cell(EC_indexes(j)),phylDist.names),1); if ~isempty(orgs_index) @@ -367,17 +473,17 @@ temp = [temp;EC_indexes(j)]; %For values related to organisms without KEGG code, then it will %look for KEGG code for the first organism with the same genus - else - k=1; - while isempty(orgs_index) && k