Source code for straindesign.compute_strain_designs

#!/usr/bin/env python3
#
# Copyright 2022 Max Planck Insitute Magdeburg
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
#
"""Function: computing metabolic strain designs (compute_strain_designs)"""

from contextlib import redirect_stdout, redirect_stderr
from typing import Dict, List, Tuple
import numpy as np
import logging
import json
import io
from copy import deepcopy
import logging
from cobra import Model
from cobra.manipulation import rename_genes
from straindesign import SDModule, SDSolutions, select_solver, fva, DisableLogger, SDProblem, SDMILP
from straindesign.names import *
from straindesign.networktools import   remove_ext_mets, remove_dummy_bounds, bound_blocked_or_irrevers_fva, \
                                        remove_irrelevant_genes, extend_model_gpr, extend_model_regulatory, \
                                        compress_model, compress_modules, compress_ki_ko_cost, expand_sd, filter_sd_maxcost


[docs]def compute_strain_designs(model: Model, **kwargs: dict) -> SDSolutions: """Computes strain designs for a user-defined strain design problem A number of arguments can be specified to detail the problem and influence the solution process. This function supports the computation of Minimal Cut Sets (MCS), OptKock, RobustKnock and OptCouple strain designs. It is possible to combine any of the latter ones with the MCS approach, e.g., to engineer growth coupled production, but also suppress the production of an undesired by-product. The computation can be started in two different ways. Either by specifying the computation parameters indivdually or reuse a parameters dictionary from a previous computation. CNApy stores strain design setup dics as JSON ".sd"-files that can be loaded in python and used as an input for this function. Example: sols = compute_strain_designs(model, sd_modules=[sd_module1, sd_module2], solution_approach = 'any') Args: model (cobra.Model): A metabolic model that is an instance of the cobra.Model class. The model may or may not contain genes/GPR-rules. sd_setup (dict): sd_setup should be a dictionary containing a set of parameters for strain design computation. The allowed keywords are the same listed hereafter. Therefore, *sd_setup and other arguments (except for model) must not be used together*. sd_modules ([straindesign.SDModule]): List of strain design modules that describe the sub-problems, such as the MCS-like protection or suppression of flux subspaces or the OptKnock, RobustKnock or OptCouple objective and constraints. The list of modules determines the global objective function of the strain design computation. If only SUPPRESS and PROTECT modules are used, the strain design computation is MCS-like, such that the number of interventions is minimized. If a module for one of the nested optimization approaches is used, the global objective function is retrieved from this module. The number of SUPPRESS and PROTECT modules is unrestricted and can be combined with the other modules, however only one of the modules OPTKNOCK, ROBUSKNOCK and OPTCOUPLE may be used at a time. For details, see SDModule. solver (optional (str)): (Default: same as defined in model / COBRApy) The solver that should be used for preparing and carrying out the strain design computation. Allowed values are 'cplex', 'gurobi', 'scip' and 'glpk'. max_cost (optional (int)): (Default: inf): The maximum cost threshold for interventions. Every possible intervention is associated with a cost value (1, by default). Strain designs cannot exceed the max_cost threshold. Individual intervention cost factors may be defined through ki_cost, ko_cost, gki_cost, gko_cost and reg_cost. max_solutions (optional (int)): (Default: inf) The maximum number of MILP solutions that are generated for a strain design problem. The number of returned strain designs is usually larger than the number of max_solutions, since a MILP solution is decompressed to multiple strain designs. When the compress-flag is set to 'False' the number of returned solutions is equal to max_solutions. M (optional (int)): (Default: None) If this value is specified (and non-zero, not None), the computation uses the big-M method instead of indicator constraints. Since GLPK does not support indicator constraints it uses the big-M method by default (with M=1000). M should be chosen 'sufficiently large' to avoid computational artifacts and 'sufficiently small' to avoid numerical issues. compress (optional (bool)): (Default: True) If 'True', the interative network compressor is used. gene_kos (optional (bool)): (Default: False) If 'True', strain designs are computed based on gene-knockouts instead of reaction knockouts. This parameter needs not be defined if any of ki_cost, ko_cost, gki_cost, gko_cost and reg_cost is used. By default, reactions are considered as knockout targets. ko_cost (optional (dict)): (Default: None) A dictionary of reaction identifiers and their associated knockout costs. If not specified, all reactions are treated as knockout candidates, equivalent to ko_cost = {'r1':1, 'r2':1, ...}. If a subset of reactions is listed in the dict, all other are not considered as knockout candidates. ki_cost (optional (dict)): (Default: None) A dictionary of reaction identifiers and their associated costs for addition. If not specified, all reactions are treated as knockout candidates. Reaction addition candidates must be present in the original model with the intended flux boundaries **after** insertion. Additions are treated adversely to knockouts, meaning that their exclusion from the network is not associated with any cost while their presence entails intervention costs. gko_cost (optional (dict)): (Default: None) A dictionary of gene identifiers and their associated knockout costs. To reference genes, gene IDs can be used,as well as gene names. If not specified, genes are not treated as knockout candidates. An exception is the 'gene_kos' argument. If 'gene_kos' is used, all genes are treated as knockout candidates with intervention costs of 1. This is equivalent to gko_cost = {'g1':1, 'g2':1, ...}. gki_cost (optional (dict)): (Default: None) A dictionary of gene identifiers and their associated addition costs. To reference genes, gene IDs can be used, as well as gene names. If not specified, none of the genes are treated as addition candidates. reg_cost (optional [dict]): ( Default: None) Regulatory interventions candidates can be optionally specified as a list. Thereby, the constraint marking the regulatory intervention is put as key and the associated intervention cost is used as the corresponding value. E.g., reg_cost = {'1 EX_o2_e = -1': 1, ... <other regulatory interventions>}. Instead of strings, constraints can also be passed as lists. reg_cost = {[{'EX_o2_e':1}, '=', -1]: 1, ...} solution_approach (optional (str)): ( Default: 'best') The approach used to find strain designs. Possible values are 'any', 'best' or 'populate'. 'any' is usually the fastest option, since optimality is not enforced. Hereby computed MCS are still irreducible intervention sets, however, not MCS with the fewest possible number of interventions. 'best' computes globally optimal strain designs, that is, MCS with the fewest number of interventions, OptKnock strain designs with the highest possible production rate, OptCouple strain designs with the hightest growth coupling potential etc.. 'populate' does the same as 'best', but makes use of CPLEX' and Gurobi's populate function to generate multiple strain designs. It is identical to 'best' when used with SCIP or GLPK. Attention: If 'any' used with OptKnock, for instance, the MILP may return the wild type as a possible immediately. Technically, the wiltype fulfills the criterion of maximal growth (inner objective) and maximality of the global objective is omitted by using 'any', so that carrying no product synthesis is permitted. Additional constraints can be used in the OptKnock problem to circumvent this. However, Optknock should generally be used with the 'best' option. time_limit (optional (int)): (Default: inf) The time limit in seconds for the MILP-solver. advanced, use_scenario (optional (bool)): Dummy parameters used for the CNApy interface. Returns: (SDSolutions): An object that contains all computed strain designs. If strain designs were computed as gene-interventions, the solution object contains a set of corresponding reaction-interventions that facilitate the analysis of the computed strain designs with COBRA methods. """ allowed_keys = { MODULES, SETUP, SOLVER, MAX_COST, MAX_SOLUTIONS, 'M', 'compress', 'gene_kos', KOCOST, KICOST, GKOCOST, GKICOST, REGCOST, SOLUTION_APPROACH, 'advanced', 'use_scenario', T_LIMIT } logging.info('Preparing strain design computation.') if SETUP in kwargs: if type(kwargs[SETUP]) is str: with open(kwargs[SETUP], 'r') as fs: kwargs = json.load(fs) else: kwargs = kwargs[SETUP] if MODULES in kwargs: sd_modules = kwargs.pop(MODULES) if isinstance(sd_modules, SDModule): sd_modules = [sd_modules] orig_sd_modules = [m.copy() for m in sd_modules] sd_modules = [m.copy() for m in sd_modules] if SOLVER in kwargs: kwargs[SOLVER] = select_solver(kwargs[SOLVER]) else: kwargs[SOLVER] = select_solver(None, model) if MAX_COST in kwargs: kwargs.update({MAX_COST: float(kwargs.pop(MAX_COST))}) else: kwargs.update({MAX_COST: np.inf}) if MODEL_ID in kwargs: model_id = kwargs.pop(MODEL_ID) if model_id != model.id: logging.warning( "Model IDs of provided model and setup not matching. Apparently, ",\ "the strain design setup was specified for a different model. "+\ "Errors might occur due to non-matching reaction or gene-identifiers.") if 'advanced' in kwargs: model_id = kwargs.pop('advanced') if 'use_scenario' in kwargs: model_id = kwargs.pop('use_scenario') # check all keys passed in kwargs for key, value in dict(kwargs).items(): if key not in allowed_keys: raise Exception("Key " + key + " is not supported.") if key == KOCOST: uncmp_ko_cost = value if key == KICOST: uncmp_ki_cost = value if key == GKOCOST: uncmp_gko_cost = value if key == GKICOST: uncmp_gki_cost = value if key == REGCOST: uncmp_reg_cost = value if (GKOCOST in kwargs or GKICOST in kwargs or ('gene_kos' in kwargs and kwargs['gene_kos'])) and hasattr(model, 'genes') and model.genes: kwargs['gene_kos'] = True used_g_ids = set(kwargs[GKOCOST] if GKOCOST in kwargs and kwargs[GKOCOST] else set()) used_g_ids.update(set(kwargs[GKICOST] if GKICOST in kwargs and kwargs[GKICOST] else set())) used_g_ids.update(set(kwargs[REGCOST] if REGCOST in kwargs and kwargs[REGCOST] else set())) # genes must not begin with number, put a 'g' in front of genes that start with a number if any([True for g in model.genes if g.id[0].isdigit()]): logging.warning("Gene IDs must not start with a digit. Inserting prefix 'g' where necessary.") rename_genes(model, {g.id: 'g' + g.id for g in model.genes if g.id[0].isdigit()}) if np.all([len(g.name) for g in model.genes]) and (np.any([g.name in used_g_ids for g in model.genes]) or not used_g_ids): has_gene_names = True else: has_gene_names = False if has_gene_names and any([True for g in model.genes if g.name[0].isdigit()]): logging.warning("Gene names must not start with a digit. Inserting prefix 'g' where necessary.") for g, v in {g.id: 'g' + g.name for g in model.genes if g.name[0].isdigit()}.items(): model.genes.get_by_id(g).name = v if GKOCOST not in kwargs or not kwargs[GKOCOST]: if has_gene_names: # if gene names are defined, use them instead of ids uncmp_gko_cost = {k: 1.0 for k in model.genes.list_attr('name')} else: uncmp_gko_cost = {k: 1.0 for k in model.genes.list_attr('id')} if GKICOST not in kwargs or not kwargs[GKICOST]: uncmp_gki_cost = {} else: kwargs['gene_kos'] = False has_gene_names = False if KOCOST not in kwargs and not kwargs['gene_kos']: uncmp_ko_cost = {k: 1.0 for k in model.reactions.list_attr('id')} elif KOCOST not in kwargs or not kwargs[KOCOST]: uncmp_ko_cost = {} if KICOST not in kwargs or not kwargs[KICOST]: uncmp_ki_cost = {} if REGCOST not in kwargs or not kwargs[REGCOST]: uncmp_reg_cost = {} # check that at most one bilevel module is provided bilvl_modules = [i for i,m in enumerate(sd_modules) \ if m[MODULE_TYPE] in [OPTKNOCK,ROBUSTKNOCK,OPTCOUPLE]] if len(bilvl_modules) > 1: raise Exception("Only one of the module types 'OptKnock', 'RobustKnock' and 'OptCouple' can be defined per "\ "strain design setup.") logging.info(' Using ' + kwargs[SOLVER] + ' for solving LPs during preprocessing.') with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()), DisableLogger(): # suppress standard output from copying model orig_model = model model = model.copy() uncmp_model = model.copy() orig_ko_cost = deepcopy(uncmp_ko_cost) orig_ki_cost = deepcopy(uncmp_ki_cost) orig_reg_cost = deepcopy(uncmp_reg_cost) if 'compress' not in kwargs: kwargs['compress'] = True if kwargs['gene_kos']: orig_gko_cost = uncmp_gko_cost orig_gki_cost = uncmp_gki_cost # ensure that gene and reaction kos/kis do not overlap g_itv = {g for g in list(uncmp_gko_cost.keys()) + list(uncmp_gki_cost.keys())} r_itv = {r for r in list(uncmp_ko_cost.keys()) + list(uncmp_ki_cost.keys())} if np.any([np.any([True for g in uncmp_model.reactions.get_by_id(r).genes if g in g_itv]) for r in r_itv]) or \ np.any(set(uncmp_gko_cost.keys()).intersection(set(uncmp_gki_cost.keys()))) or \ np.any(set(uncmp_ko_cost.keys()).intersection(set(uncmp_ki_cost.keys()))): raise Exception('Specified gene and reaction knock-out/-in costs contain overlap. '\ 'Make sure that metabolic interventions are enabled either through reaction or '\ 'through gene interventions and are defined either as knock-ins or as knock-outs.') # 1) Preprocess Model with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()), DisableLogger(): # suppress standard output from copying model cmp_model = uncmp_model.copy() # remove external metabolites remove_ext_mets(cmp_model) # replace model bounds with +/- inf if above a certain threshold remove_dummy_bounds(model) # FVAs to identify blocked, irreversible and essential reactions, as well as non-bounding bounds logging.info(' FVA to identify blocked reactions and irreversibilities.') bound_blocked_or_irrevers_fva(model, solver=kwargs[SOLVER]) logging.info(' FVA(s) to identify essential reactions.') essential_reacs = set() for m in sd_modules: if m[MODULE_TYPE] != SUPPRESS: # Essential reactions can only be determined from desired # or opt-/robustknock modules flux_limits = fva(cmp_model, solver=kwargs[SOLVER], constraints=m[CONSTRAINTS]) for (reac_id, limits) in flux_limits.iterrows(): if np.min(abs(limits)) > 1e-10 and np.prod(np.sign(limits)) > 0: # find essential essential_reacs.add(reac_id) # remove ko-costs (and thus knockability) of essential reactions [uncmp_ko_cost.pop(er) for er in essential_reacs if er in uncmp_ko_cost] # If computation of gene-gased intervention strategies, (optionally) compress gpr are rules and extend stoichimetric network with genes if kwargs['gene_kos']: if kwargs['compress'] is True or kwargs['compress'] is None: num_genes = len(cmp_model.genes) num_gpr = len([True for r in model.reactions if r.gene_reaction_rule]) logging.info('Preprocessing GPR rules (' + str(num_genes) + ' genes, ' + str(num_gpr) + ' gpr rules).') # removing irrelevant genes will also remove essential reactions from the list of knockable genes uncmp_gko_cost = remove_irrelevant_genes(cmp_model, essential_reacs, uncmp_gki_cost, uncmp_gko_cost) if len(cmp_model.genes) < num_genes or len([True for r in model.reactions if r.gene_reaction_rule]) < num_gpr: num_genes = len(cmp_model.genes) num_gpr = len([True for r in cmp_model.reactions if r.gene_reaction_rule]) logging.info(' Simplifyied to '+str(num_genes)+' genes and '+\ str(num_gpr)+' gpr rules.') logging.info(' Extending metabolic network with gpr associations.') reac_map = extend_model_gpr(cmp_model, has_gene_names) for i, m in enumerate(sd_modules): for p in [CONSTRAINTS, INNER_OBJECTIVE, OUTER_OBJECTIVE, PROD_ID]: if p in m and m[p] is not None: if p == CONSTRAINTS: for c in m[p]: for k in list(c[0].keys()): v = c[0].pop(k) for n, w in reac_map[k].items(): c[0][n] = v * w if p in [INNER_OBJECTIVE, OUTER_OBJECTIVE, PROD_ID]: for k in list(m[p].keys()): v = m[p].pop(k) for n, w in reac_map[k].items(): m[p][n] = v * w uncmp_ko_cost.update(uncmp_gko_cost) uncmp_ki_cost.update(uncmp_gki_cost) uncmp_ko_cost.update(extend_model_regulatory(cmp_model, uncmp_reg_cost)) cmp_ko_cost = uncmp_ko_cost cmp_ki_cost = uncmp_ki_cost # Compress model if kwargs['compress'] is True or kwargs['compress'] is None: # If compression is activated (or not defined) logging.info('Compressing Network (' + str(len(cmp_model.reactions)) + ' reactions).') # compress network by lumping sequential and parallel reactions alternatingly. # Exclude reactions named in strain design modules from parallel compression no_par_compress_reacs = set() for m in sd_modules: for p in [CONSTRAINTS, INNER_OBJECTIVE, OUTER_OBJECTIVE, PROD_ID]: if p in m and m[p] is not None: param = m[p] if p == CONSTRAINTS: for c in param: for k in c[0].keys(): no_par_compress_reacs.add(k) if p in [INNER_OBJECTIVE, OUTER_OBJECTIVE, PROD_ID]: for k in param.keys(): no_par_compress_reacs.add(k) cmp_mapReac = compress_model(cmp_model, no_par_compress_reacs) # compress information in strain design modules sd_modules = compress_modules(sd_modules, cmp_mapReac) # compress ko_cost and ki_cost cmp_ko_cost, cmp_ki_cost, cmp_mapReac = compress_ki_ko_cost(cmp_ko_cost, cmp_ki_cost, cmp_mapReac) else: cmp_mapReac = [] # An FVA to identify essentials before building and launching MILP (not sure if this has an effect) logging.info(' FVA(s) in compressed model to identify essential reactions.') essential_reacs = set() for m in sd_modules: if m[MODULE_TYPE] != SUPPRESS: # Essential reactions can only be determined from desired # or opt-/robustknock modules flux_limits = fva(cmp_model, solver=kwargs[SOLVER], constraints=m[CONSTRAINTS]) for (reac_id, limits) in flux_limits.iterrows(): if np.min(abs(limits)) > 1e-10 and np.prod(np.sign(limits)) > 0: # find essential essential_reacs.add(reac_id) # remove ko-costs (and thus knockability) of essential reactions [cmp_ko_cost.pop(er) for er in essential_reacs if er in cmp_ko_cost] essential_kis = set(cmp_ki_cost[er] for er in essential_reacs if er in cmp_ki_cost) # Build MILP kwargs1 = kwargs kwargs1[KOCOST] = cmp_ko_cost kwargs1[KICOST] = cmp_ki_cost kwargs1['essential_kis'] = essential_kis kwargs1.pop('compress') if GKOCOST in kwargs1: kwargs1.pop(GKOCOST) if GKICOST in kwargs1: kwargs1.pop(GKICOST) if REGCOST in kwargs1: kwargs1.pop(REGCOST) kwargs_milp = {k: v for k, v in kwargs.items() if k in [SOLVER, MAX_COST, 'M']} kwargs_milp.update({KOCOST: cmp_ko_cost}) kwargs_milp.update({KICOST: cmp_ki_cost}) kwargs_milp.update({'essential_kis': essential_kis}) logging.info("Finished preprocessing:") logging.info(" Model size: " + str(len(cmp_model.reactions)) + " reactions, " + str(len(cmp_model.metabolites)) + " metabolites") logging.info(" " + str(len(cmp_ko_cost) + len(cmp_ki_cost) - len(essential_kis)) + " targetable reactions") sd_milp = SDMILP(cmp_model, sd_modules, **kwargs_milp) kwargs_computation = {} if MAX_SOLUTIONS in kwargs: kwargs_computation.update({MAX_SOLUTIONS: float(kwargs.pop(MAX_SOLUTIONS))}) if T_LIMIT in kwargs: kwargs_computation.update({T_LIMIT: float(kwargs.pop(T_LIMIT))}) kwargs_computation.update({'show_no_ki': True}) # solution approach if SOLUTION_APPROACH in kwargs: solution_approach = kwargs.pop(SOLUTION_APPROACH) else: solution_approach = BEST # solve MILP if solution_approach == ANY: cmp_sd_solution = sd_milp.compute(**kwargs_computation) elif solution_approach == BEST: cmp_sd_solution = sd_milp.compute_optimal(**kwargs_computation) elif solution_approach == POPULATE: cmp_sd_solution = sd_milp.enumerate(**kwargs_computation) logging.info(' Decompressing.') if cmp_sd_solution.status in [OPTIMAL, TIME_LIMIT_W_SOL]: sd = expand_sd(cmp_sd_solution.get_reaction_sd_mark_no_ki(), cmp_mapReac) sd = filter_sd_maxcost(sd, kwargs[MAX_COST], uncmp_ko_cost, uncmp_ki_cost) sd = postprocess_reg_sd(uncmp_reg_cost, sd) else: sd = [] setup = deepcopy(cmp_sd_solution.sd_setup) setup.update({MODULES: orig_sd_modules, KOCOST: orig_ko_cost, KICOST: orig_ki_cost, REGCOST: orig_reg_cost}) if kwargs['gene_kos']: setup.update({GKOCOST: orig_gko_cost, GKICOST: orig_gki_cost}) sd_solutions = SDSolutions(orig_model, sd, cmp_sd_solution.status, setup) logging.info(str(len(sd)) + ' solutions found.') return sd_solutions
[docs]def postprocess_reg_sd(reg_cost, sd): """Postprocess regulatory interventions Mark regulatory interventions with true or false""" for s in sd: for k, v in reg_cost.items(): if k in s: s.pop(k) s.update({v['str']: True}) else: s.update({v['str']: False}) return sd