diff --git a/met4j-toolbox/src/main/java/fr/inrae/toulouse/metexplore/met4j_toolbox/networkAnalysis/SideCompoundsScan.java b/met4j-toolbox/src/main/java/fr/inrae/toulouse/metexplore/met4j_toolbox/networkAnalysis/SideCompoundsScan.java index 6744936d92d3ca216305547e8ce898d1081af455..8b6a27b0ec0e87847265728a624faf76c7dc59d0 100644 --- a/met4j-toolbox/src/main/java/fr/inrae/toulouse/metexplore/met4j_toolbox/networkAnalysis/SideCompoundsScan.java +++ b/met4j-toolbox/src/main/java/fr/inrae/toulouse/metexplore/met4j_toolbox/networkAnalysis/SideCompoundsScan.java @@ -4,19 +4,22 @@ import fr.inrae.toulouse.metexplore.met4j_chemUtils.FormulaParser; import fr.inrae.toulouse.metexplore.met4j_core.biodata.BioMetabolite; import fr.inrae.toulouse.metexplore.met4j_core.biodata.BioNetwork; import fr.inrae.toulouse.metexplore.met4j_graph.computation.connect.weighting.DefaultWeightPolicy; +import fr.inrae.toulouse.metexplore.met4j_graph.computation.transform.VertexContraction; import fr.inrae.toulouse.metexplore.met4j_graph.core.WeightingPolicy; import fr.inrae.toulouse.metexplore.met4j_graph.core.compound.CompoundGraph; import fr.inrae.toulouse.metexplore.met4j_graph.io.Bionetwork2BioGraph; import fr.inrae.toulouse.metexplore.met4j_io.jsbml.reader.JsbmlReader; import fr.inrae.toulouse.metexplore.met4j_io.jsbml.reader.Met4jSbmlReaderException; -import fr.inrae.toulouse.metexplore.met4j_core.utils.StringUtils; import fr.inrae.toulouse.metexplore.met4j_toolbox.generic.AbstractMet4jApplication; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.kohsuke.args4j.Option; import java.io.FileWriter; import java.io.IOException; -import java.util.regex.Pattern; +import java.util.HashMap; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; /** * @@ -50,6 +53,11 @@ public class SideCompoundsScan extends AbstractMet4jApplication { @Option(name = "-er", aliases = {"--edgeRedundancy"}, usage = "flag as side compound any 
compound with a number of redundancy in incident edges (parallel edges connecting to the same neighbor) above the given threshold") public double parallelEdge = Double.NaN; + enum strategy {by_name,by_id} + @Option(name = "-m", aliases = {"--merge"}, usage = "Degree is shared between compounds in different compartments. " + + "Use names if consistent and unambiguous across compartments, or identifiers if compartment suffix is present (id in form \"xxx_y\" with xxx as base identifier and y as compartment label).") + public strategy mergingStrat = null; + public static void main(String[] args) throws IOException, Met4jSbmlReaderException { @@ -86,12 +94,32 @@ public class SideCompoundsScan extends AbstractMet4jApplication { //perform scan //------------ System.err.println("Scaning..."); + + //if merging compartment + Map<String, Integer> mergedDegree = new HashMap<>(); + Boolean merge = (mergingStrat!=null); + Function<BioMetabolite,String> getSharedId = BioMetabolite::getName; + if(merge){ + if(mergingStrat.equals(strategy.by_id)) getSharedId = (new VertexContraction.MapByIdSubString("^(\\w+)_\\w$"))::commonField; + + mergedDegree = graph.vertexSet().stream().collect( + Collectors.groupingBy( + getSharedId, + Collectors.summingInt(v -> graph.degreeOf(v)) + ) + ); + } + //degree statistics DescriptiveStatistics degreeStats = new DescriptiveStatistics(); double dt = degree; if (!Double.isNaN(degreePrecentile)) { for (BioMetabolite v : graph.vertexSet()) { - degreeStats.addValue(graph.degreeOf(v)); + if (merge){ + degreeStats.addValue(mergedDegree.get(getSharedId.apply(v))); + }else{ + degreeStats.addValue(graph.degreeOf(v)); + } } dt = degreeStats.getPercentile(degreePrecentile); } @@ -120,7 +148,7 @@ public class SideCompoundsScan extends AbstractMet4jApplication { StringBuffer l = new StringBuffer(v.getId()); if (reportValue) l.append("\t" + v.getName()); - int d = graph.degreeOf(v); + int d = merge ? 
mergedDegree.get(getSharedId.apply(v)) : graph.degreeOf(v); boolean sideFromDegree = (d >= degree); if (sideFromDegree) side = true; if (reportValue) l.append("\t" + d); @@ -137,7 +165,7 @@ public class SideCompoundsScan extends AbstractMet4jApplication { if (flagInorganic || flagNoFormula) { String formula = v.getChemicalFormula(); String inorganic = "?"; - String validForumla = "true"; + String validFormula = "true"; try{ FormulaParser fp = new FormulaParser(formula); if(flagInorganic){ @@ -150,14 +178,14 @@ public class SideCompoundsScan extends AbstractMet4jApplication { } }catch(IllegalArgumentException e){ if(flagNoFormula){ - validForumla = "false"; + validFormula = "false"; side = true; } } if (reportValue){ if(flagInorganic) l.append("\t" + inorganic); - if(flagNoFormula) l.append("\t" + validForumla); + if(flagNoFormula) l.append("\t" + validFormula); } } @@ -185,7 +213,7 @@ public class SideCompoundsScan extends AbstractMet4jApplication { public String getLongDescription() { return this.getShortDescription() + "\n" + "Side compounds are metabolites of small relevance for topological analysis. Their definition can be quite subjective and varies between sources.\n" + - "Side compounds tends to be ubiquitous and not specific to a particular biochemical or physiological process.\n" + + "Side compounds tend to be ubiquitous and not specific to a particular biochemical or physiological process.\n" + "Compounds usually considered as side compounds include water, atp or carbon dioxide. By being involved in many reactions and thus connected to many compounds, " + "they tend to significantly lower the average shortest path distances beyond expected metabolic relatedness.\n" + "This tool attempts to propose a list of side compounds according to specific criteria: \n" +