Source code for simplicity.phenotype.consensus

# This file is part of SIMPLICITY
# Copyright (C) 2025 Pietro Gerletti
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# -*- coding: utf-8 -*-
import simplicity.phenotype.weight as w 
import simplicity.evolution.reference as ref 
import numpy as np

reference = ref.get_reference()

[docs] def get_seq_weights(data,t_sim): """ preprocess lineage data to calculate weighted consensus. calculate weight for sequence at time t_sim Parameters ---------- data : list list of list containing lineages data. data contains lineage_name, sequence, n_infected_t, t t_sim : float time of the simulation at which we evalute the weights. Returns ------- seq_for_consensus : list [sequence, #_infected_t, w_t(t_sim)]. """ # get parameters for w_t(t_sim) params = w.w_t_params() k_e = params[0] k_a = params[1] seq_for_consensus = [] # loop over data, compute weight and add entry to list for lst in data: seq_for_consensus.append([ lst[0], lst[1], w.weights(lst[2],t_sim, k_e, k_a, w.t_max) ]) return seq_for_consensus
[docs] def build_weighted_consensus_matrix(data): ''' Parameters ---------- data : TYPE DESCRIPTION. Returns ------- matrix : TYPE DESCRIPTION. bases : TYPE DESCRIPTION. unique_positions : TYPE DESCRIPTION. ''' # Extract unique positions unique_positions= sorted(set(num for entry in data for num, _ in entry[0])) num_index = {num: i for i, num in enumerate(unique_positions)} bases = ['A', 'T', 'C', 'G'] letter_index = {letter: i for i, letter in enumerate(bases)} # Initialize the matrix matrix = np.zeros((len(bases), len(unique_positions)), dtype=float) # Populate the matrix, accounting for missing positions for seq, n_infected, weight in data: # Determine missing positions existing_positions = set(num for num, _ in seq) missing_positions = set(unique_positions) - existing_positions # Update matrix for existing positions for num, char in seq: row = letter_index[char] col = num_index[num] matrix[row, col] += n_infected * weight # Update matrix for missing positions for num in missing_positions: char = reference[num] # fetch base from reference genome row = letter_index[char] col = num_index[num] matrix[row, col] += n_infected * weight return matrix, bases, unique_positions
[docs] def weighted_consensus(matrix, positions): ''' calculate the weighted consensus sequence between individuals in the population ''' bases = ['A', 'T', 'C', 'G'] consensus = [] # Iterate through each column in the matrix to find the base with the highest count for col_idx, pos in enumerate(positions): # Use argmax to find the index of the maximum count in the column max_base_idx = np.argmax(matrix[:, col_idx]) # Use the index to get the corresponding letter max_base= bases[max_base_idx] # if position is diffrent from wt, append it. (we encode sequences as only positions that differ from wt) if max_base != reference[pos]: # Append the position (num) and the letter with the highest count consensus.append([pos, max_base]) return consensus
[docs] def get_consensus(data,t): data = get_seq_weights(data,t) matrix, bases, positions= build_weighted_consensus_matrix(data) return weighted_consensus(matrix, positions)
# ## example use # data = [ # [[[10,'A'],[45,'G']],10,10], # [[[13,'A']], 5,10], # [[[10,'C'],[45,'G']],10,10] # ] # # data contains sequence, n_infected_t, t # data = get_seq_weights(data,100) # # Build the ndarray # matrix, bases, positions= build_weighted_consensus_matrix(data) # # Call the function with the matrix and unique numbers # positions_with_max_letter = weighted_consensus(matrix, positions) # # # To display the result similarly to a pandas DataFrame for visualization: # # import pandas as pd # # df = pd.DataFrame(matrix, index=bases, columns=positions) # # print(df) # # Display the result # print(positions_with_max_letter) # import simplicity.evolution.decoder as dec # example_consensus_decoded = dec.decode_genome(positions_with_max_letter) # print(example_consensus_decoded)