# Source code for modules.gemtractor.gemtractor

# This file is part of the GEMtractor
# Copyright (C) 2019 Martin Scharm <>
# The GEMtractor is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# The GEMtractor is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import copy
import logging
import math
import re

import pyparsing as pp
from libsbml import FbcAssociation_parseFbcInfixAssociation, SBMLReader

from .exceptions import BreakLoops, InvalidGeneExpression
from .network.gene import Gene
from .network.genecomplex import GeneComplex
from .network.network import Network
from .utils import Utils

class GEMtractor:
    """
    Main class for the GEMtractor

    It reads an SBML file, trims entities, and extracts the encoded network.

    :param sbml_file: path to the SBML file
    :type sbml_file: str
    :raises IOError: if libsbml reports errors while reading the file
    """

    def __init__(self, sbml_file):
        # the gene association is captured up to the next tag; the leading
        # greedy ".*" makes the pattern pick the *last* GENE_ASSOCIATION
        self.__GENE_ASSOCIATION_PATTERN = re.compile(r".*GENE_ASSOCIATION:([^<]+) *<.*", re.DOTALL)
        self.__GENE_LIST_PATTERN = re.compile(r".*GENE_LIST: *([^ <][^<]*)<.*", re.DOTALL)
        self.__EXPRESSION_PARSER = self.__get_expression_parser()
        self.__logger = logging.getLogger(__name__)
        # cache: reaction id -> list of gene complexes, see _get_genes
        self.__reaction_gene_map = {}
        self.__sbml_file = sbml_file
        # only set once get_sbml was called
        self.__fbc_plugin = None

        self.__logger.debug("reading sbml file " + self.__sbml_file)
        self.sbml = SBMLReader().readSBML(self.__sbml_file)
        if self.sbml.getNumErrors() > 0:
            e = [self.sbml.getError(i).getMessage()
                 for i in range(self.sbml.getNumErrors())]
            raise IOError("model seems to be invalid: " + str(e))

    def __get_expression_parser(self):
        """
        get a parser to parse gene-association strings of reactions

        Identifiers consist of alphanumerics plus ``_``, ``-`` and ``.``;
        ``and`` binds stronger than ``or``, both are left-associative.

        :return: the expression parser
        :rtype: pyparsing ParserElement
        """
        variables = pp.Word(pp.alphanums + "_-.")
        condition = pp.Group(variables)
        return pp.infixNotation(condition, [
            ("and", 2, pp.opAssoc.LEFT,),
            ("or", 2, pp.opAssoc.LEFT,),
        ])

    def _parse_expression(self, expression):
        """
        parse a gene-association expression

        uses the expression parser from :func:`_GEMtractor__get_expression_parser`;
        the expression is lower-cased before parsing and must be consumed
        completely

        :param expression: the gene-association expression
        :type expression: str

        :return: the parse result
        :rtype: pyparsing ParseResults
        :raises InvalidGeneExpression: if the expression cannot be parsed
        """
        try:
            return self.__EXPRESSION_PARSER.parseString(expression.lower(), True)
        except pp.ParseException as e:
            raise InvalidGeneExpression(
                "cannot parse expression: >>" + expression + "<< -- "
                + getattr(e, 'message', repr(e))) from e

    def _unfold_complex_expression(self, parseresult):
        """
        unfold a gene-association parse result

        takes a parse result and unfolds it to a list of alternative gene
        complexes, which can catalyze a certain reaction

        :param parseresult: the result of the expression parser
        :type parseresult: pyparsing ParseResults

        :return: the list of gene-complexes catalyzing
        :rtype: list of :class:`.network.genecomplex.GeneComplex`
        :raises NotImplementedError: if the parse result has an unexpected
            shape, mixes and/or in one chain, or uses an unknown operator
        """
        if not isinstance(parseresult, pp.ParseResults):
            # a leaf -> a single gene, which forms a complex on its own
            return [GeneComplex(Gene(parseresult))]

        if len(parseresult) == 1:
            return self._unfold_complex_expression(parseresult[0])

        if len(parseresult) < 2 or len(parseresult) % 2 == 0:
            # a chain must look like: term (operator term)+
            self.__logger.critical('unexpected expression: ' + str(parseresult))
            raise NotImplementedError('unexpected expression: ' + str(parseresult))

        # this is a chain of ANDs/ORs
        # make sure it's only ANDs or only ORs
        operators = [parseresult[i] for i in range(1, len(parseresult), 2)]
        if any(op != operators[0] for op in operators[1:]):
            # if it's mixed we'll raise an error
            self.__logger.critical('and/or mixed in expression: ' + str(parseresult[1]))
            raise NotImplementedError('and/or mixed in expression: ' + str(parseresult[1]))

        if operators[0] == "and":
            logic = True
        elif operators[0] == "or":
            logic = False
        else:
            self.__logger.critical('neither and nor or -> do not understand: ' + str(parseresult[1]))
            raise NotImplementedError('neither and nor or -> do not understand: ' + str(parseresult[1]))

        # parse the first subterm
        term_combination = self._unfold_complex_expression(parseresult[0])
        # from the second subterm, it depends on the logic-connection on how to combine subterms
        for i in range(2, len(parseresult), 2):
            newterms = self._unfold_complex_expression(parseresult[i])
            if logic:
                # it's ANDed -> every previous alternative needs to be
                # combined with every new alternative (cross product).
                # Each combination is built on its own copy; merging all new
                # alternatives into the same complex would turn
                # "a and (b or c)" into the single complex "a and b and c".
                combined = []
                for a in term_combination:
                    for b in newterms:
                        merged = copy.deepcopy(a)
                        merged.add_genes(b)
                        combined.append(merged)
                term_combination = combined
            else:
                # just add them to the list as alternatives
                term_combination += newterms
        return term_combination

    def _extract_genes_from_sbml_notes(self, annotation, default):
        """
        extract the genes from a free-text sbml note

        expects the gene associations as

        .. code-block:: xml

           <p>GENE_ASSOCIATION: a and (b or c)</p>

        :param annotation: the annotation string
        :param default: the default to return if no gene-associations were found
        :type annotation: str
        :type default: str

        :return: the gene-associations (stripped), or `default` if there is
            no GENE_ASSOCIATION entry or it is blank
        :rtype: str
        """
        m = re.match(self.__GENE_ASSOCIATION_PATTERN, annotation)
        if m:
            g = m.group(1).strip()
            if len(g) > 0:
                return g
        return default

    def _overwrite_genes_in_sbml_notes(self, new_genes, reaction):
        """
        set the gene-associations for a note in an sbml reaction

        overwrites it, if it was set already; notes without a
        GENE_ASSOCIATION entry are left untouched

        :param new_genes: the new gene-associations
        :param reaction: the sbml reaction
        :type new_genes: str
        :type reaction: libsbml Reaction
        """
        notes = reaction.getNotesString()
        m = re.match(self.__GENE_ASSOCIATION_PATTERN, notes)
        if m:
            # replace exactly the matched span; a plain str.replace of the
            # captured text would also rewrite every other occurrence of it
            # elsewhere in the notes
            reaction.setNotes(notes[:m.start(1)] + new_genes + notes[m.end(1):])
        else:
            self.__logger.debug('no gene notes to update: ' + reaction.getId())
        # TODO
        # also update the GENE_LIST using __GENE_LIST_PATTERN
        # but for this we need the list of genes here (not only the logic expression)

    def get_gene_product_annotations(self, gene):
        """
        get the annotations of a gene product

        if the document is annotated using the SBML FBC package there is good
        chance we'll find more information about a gene in the gene-product's
        annotations; the FBC plugin is only available after :func:`get_sbml`
        was called

        :param gene: the label gene product of interest
        :type gene: str

        :return: the annotations of the gene-product labeled `gene` or None if there are no such annotations
        :rtype: xml str
        """
        if self.__fbc_plugin is not None:
            gp = self.__fbc_plugin.getGeneProductByLabel(gene)
            if gp is not None:
                return gp.getAnnotationString()
        return None

    def get_reaction_annotations(self, reactionid):
        """
        get the annotations of an sbml reaction

        :param reactionid: the identifier of the reaction in the sbml document
        :type reactionid: str

        :return: the annotations of that reaction
        :rtype: xml str
        """
        return self.sbml.getModel().getReaction(reactionid).getAnnotationString()

    def _strip_filtered_species(self, reaction, filter_species, drop_reaction):
        """
        remove filtered species from the reactants, products, and modifiers of a reaction

        :param reaction: the sbml reaction to clean
        :param filter_species: species identifiers to get rid of
        :param drop_reaction: if True nothing is removed from the reaction; instead the first hit is reported
        :type reaction: libsbml Reaction
        :type filter_species: list of str
        :type drop_reaction: bool

        :return: True if the caller is supposed to remove the whole reaction
        :rtype: bool
        """
        participants = (
            (reaction.getNumReactants, reaction.getReactant, reaction.removeReactant),
            (reaction.getNumProducts, reaction.getProduct, reaction.removeProduct),
            (reaction.getNumModifiers, reaction.getModifier, reaction.removeModifier),
        )
        for count, get, remove in participants:
            # walk backwards, so removing an item does not shift pending indices
            for sn in range(count() - 1, -1, -1):
                if get(sn).getSpecies() in filter_species:
                    if drop_reaction:
                        return True
                    remove(sn)
        return False

    def get_sbml(self, filter_species=None, filter_reactions=None, filter_genes=None,
                 filter_gene_complexes=None, remove_reaction_enzymes_removed=True,
                 remove_ghost_species=False, discard_fake_enzymes=False,
                 remove_reaction_missing_species=False,
                 removing_enzyme_removes_complex=True):
        """
        Get a filtered SBML document from the model file

        this applies the trimming according to the arguments to the SBML
        document that was read on construction, and returns the trimmed
        model. The document is modified in place: every call appends a suffix
        to the model's id, sets a new model name, and adds a model note.

        :param filter_species: species identifiers to get rid of
        :param filter_reactions: reaction identifiers to get rid of
        :param filter_genes: enzyme identifiers to get rid of
        :param filter_gene_complexes: enzyme-complex identifiers to get rid of, every list-item should be of format: 'A + B + gene42'
        :param remove_reaction_enzymes_removed: should we remove a reaction if all its genes were removed?
        :param remove_ghost_species: should the filtered species also be removed from the model's list of species - even though they might be required in other entities?
        :param discard_fake_enzymes: should reactions catalyzed by fake enzymes (implicitly assumed enzymes, if no enzymes are annotated to a reaction) be removed?
        :param remove_reaction_missing_species: remove a reaction if one of its participating species was removed?
        :param removing_enzyme_removes_complex: if an enzyme is removed, should also all enzyme complexes be removed in which it participates?

        :type filter_species: list of str
        :type filter_reactions: list of str
        :type filter_genes: list of str
        :type filter_gene_complexes: list of str
        :type remove_reaction_enzymes_removed: bool
        :type remove_ghost_species: bool
        :type discard_fake_enzymes: bool
        :type remove_reaction_missing_species: bool
        :type removing_enzyme_removes_complex: bool

        :return: the SBML document
        :rtype: libsbml SBMLDocument
        :raises NotImplementedError: if no genes can be determined for a reaction
        """
        # None stands for "nothing to filter"; no mutable default arguments
        filter_species = [] if filter_species is None else filter_species
        filter_reactions = [] if filter_reactions is None else filter_reactions
        filter_genes = [] if filter_genes is None else filter_genes
        filter_gene_complexes = [] if filter_gene_complexes is None else filter_gene_complexes

        self.__logger.debug("processing sbml model from " + self.__sbml_file)
        model = self.sbml.getModel()
        name = model.getName()
        if name is None or len(name) < 1:
            name = model.getId()
        model.setId(model.getId() + "_gemtracted_ReactionNetwork")
        model.setName("GEMtracted ReactionNetwork of " + name)
        self.__logger.info("got proper sbml model")

        self.__fbc_plugin = model.getPlugin("fbc")

        self.__logger.debug("append a note")
        Utils.add_model_note(model, filter_species, filter_reactions, filter_genes,
                             filter_gene_complexes, remove_reaction_enzymes_removed,
                             remove_ghost_species, discard_fake_enzymes,
                             remove_reaction_missing_species,
                             removing_enzyme_removes_complex)

        if not (len(filter_species) > 0 or len(filter_reactions) > 0
                or len(filter_genes) > 0 or len(filter_gene_complexes) > 0
                or discard_fake_enzymes):
            return self.sbml

        # TODO dc modified?
        self.__logger.debug("filtering things")
        # iterate backwards, so removing a reaction does not shift pending indices
        for n in range(model.getNumReactions() - 1, -1, -1):
            reaction = model.getReaction(n)
            if reaction.getId() in filter_reactions:
                model.removeReaction(n)
                continue

            # a reaction that loses a species is (optionally) dropped; this
            # must only skip the current reaction, not stop the filtering
            if self._strip_filtered_species(reaction, filter_species,
                                            remove_reaction_missing_species):
                model.removeReaction(n)
                continue

            current_genes = self._get_genes(reaction)
            self.__logger.debug("current genes: " + self._implode_genes(current_genes) + " - reaction: " + reaction.getId())

            if len(current_genes) < 1:
                self.__logger.info("did not find genes in reaction " + reaction.getId())
                raise NotImplementedError("did not find genes in reaction " + reaction.getId())

            # fake enzymes carry the "reaction_" marker, see __find_genes
            if discard_fake_enzymes and len(current_genes) == 1 and "reaction_" in current_genes[0].get_id():
                model.removeReaction(n)
                continue

            final_genes = [
                g for g in current_genes
                if g.get_id() not in filter_genes
                and g.get_id() not in filter_gene_complexes
                and not (removing_enzyme_removes_complex and g.contains_one_of(filter_genes))
            ]

            if len(final_genes) < 1:
                if remove_reaction_enzymes_removed:
                    model.removeReaction(n)
                    continue
                final_genes = [Gene(reaction.getId())]

            # should we update the genes in the model?
            # NOTE(review): comparing lengths misses the case where a single
            # gene was replaced by the fallback gene above - confirm whether
            # the SBML should be rewritten then as well
            if len(final_genes) != len(current_genes):
                self._set_genes_in_sbml(final_genes, reaction)
            self.__reaction_gene_map[reaction.getId()] = final_genes

            if reaction.getNumReactants() + reaction.getNumModifiers() + reaction.getNumProducts() == 0:
                model.removeReaction(n)

        if len(filter_species) > 0 and remove_ghost_species:
            for n in range(model.getNumSpecies() - 1, -1, -1):
                species = model.getSpecies(n)
                if species.getId() in filter_species:
                    model.removeSpecies(n)

        return self.sbml

    def _set_genes_in_sbml(self, genes, reaction):
        """
        set the genes of a reaction in an sbml model

        updates the gene product association of the SBML FBC package, if the
        reaction has one, and overwrites the GENE_ASSOCIATION in the
        reaction's notes, if there is one. if genes already exist they will be
        overwritten...

        :param genes: the new list of gene associations
        :param reaction: the reaction to annotate
        :type genes: list of :class:`.network.genecomplex.GeneComplex`
        :type reaction: libsbml Reaction
        """
        expression = self._implode_genes(genes)
        rfbc = reaction.getPlugin("fbc")
        if rfbc is not None:
            gpa = rfbc.getGeneProductAssociation()
            if gpa is not None:
                gpa.setAssociation(FbcAssociation_parseFbcInfixAssociation(expression, self.__fbc_plugin))
        else:
            self.__logger.debug('no fbc to update: ' + reaction.getId())
        self._overwrite_genes_in_sbml_notes(expression, reaction)

    def _implode_genes(self, genes):
        """
        implode a list of genes into a proper logical expression

        basically joins the sbml strings of the items with `or` and encloses
        the result in brackets; an empty list yields `()`

        :param genes: the list of optional genes
        :type genes: list of :class:`.network.genecomplex.GeneComplex`

        :return: the logical expression (genes joined using 'or')
        :rtype: str
        """
        return "(" + " or ".join(str(g.to_sbml_string()) for g in genes) + ")"

    def _get_genes(self, reaction):
        """
        get the genes associated to a reaction

        Will cache the gene associations. If there is nothing in cache, it
        will run :func:`_GEMtractor__find_genes`, :func:`_parse_expression`,
        and :func:`_unfold_complex_expression` to find them.

        :param reaction: the SBML S-Base reaction
        :type reaction: libsbml Reaction

        :return: list of GeneComplexes catalyzing the reaction
        :rtype: list of :class:`.network.genecomplex.GeneComplex`
        """
        rid = reaction.getId()
        if rid not in self.__reaction_gene_map:
            expression = self._parse_expression(self.__find_genes(reaction))
            self.__reaction_gene_map[rid] = self._unfold_complex_expression(expression)
        return self.__reaction_gene_map[rid]

    def __find_genes(self, reaction):
        """
        Look for genes associated to a reaction.

        Will check if there are annotations using the SBML FBC package,
        otherwise it will evaluate the reactions' notes. If there are still no
        valid gene associations, the function assumes there is an undocumented
        catalyst and falls back to a gene with the reaction's identifier
        (prefixed with 'reaction_').

        :param reaction: the SBML S-Base reaction
        :type reaction: libsbml Reaction

        :return: the gene associations
        :rtype: str
        """
        rfbc = reaction.getPlugin("fbc")
        if rfbc is not None:
            gpa = rfbc.getGeneProductAssociation()
            if gpa is not None:
                return gpa.getAssociation().toInfix()
            self.__logger.debug('no association: ' + reaction.getId())
        else:
            self.__logger.debug('no fbc: ' + reaction.getId())
        return self._extract_genes_from_sbml_notes(reaction.getNotesString(), "reaction_" + reaction.getId())

    def extract_network_from_sbml(self):
        """
        Extract the Network from the SBML model

        Will go through the SBML document and convert the (remaining) entities
        into our own network structure. Only reactants and products are
        linked to a reaction, modifiers are not evaluated here.

        :return: the network (after optional trimming)
        :rtype: :class:`.network.network.Network`
        :raises IOError: if the SBML document reports errors
        :raises NotImplementedError: if no genes can be determined for a reaction
        """
        if self.sbml.getNumErrors() > 0:
            raise IOError("model seems to be invalid")

        model = self.sbml.getModel()
        self.__logger.info("extracting network from " + model.getId())
        network = Network()
        # species id -> species object of the network
        species = {}

        for n in range(model.getNumSpecies()):
            s = model.getSpecies(n)
            species[s.getId()] = network.add_species(s.getId(), s.getName())

        for n in range(model.getNumReactions()):
            if n % 100 == 0:
                self.__logger.info("processing reaction " + str(n))
            reaction = model.getReaction(n)
            # TODO: reversible?
            r = network.add_reaction(reaction.getId(), reaction.getName())
            if reaction.isSetReversible():
                r.reversible = reaction.getReversible()

            current_genes = self._get_genes(reaction)
            self.__logger.debug("current genes: " + self._implode_genes(current_genes) + " - reaction: " + reaction.getId())

            if len(current_genes) < 1:
                self.__logger.debug("did not find genes in reaction " + reaction.getId())
                raise NotImplementedError("did not find genes in reaction " + reaction.getId())

            network.add_genes(r, current_genes)

            for sn in range(reaction.getNumReactants()):
                r.add_input(species[reaction.getReactant(sn).getSpecies()])
            for sn in range(reaction.getNumProducts()):
                r.add_output(species[reaction.getProduct(sn).getSpecies()])

        self.__logger.info("extracted network")
        return network