# This file is part of SIMPLICITY
# Copyright (C) 2025 Pietro Gerletti
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -*- coding: utf-8 -*-
import simplicity.phenotype.weight as w
import simplicity.evolution.reference as ref
import numpy as np
reference = ref.get_reference()
[docs]
def get_seq_weights(data,t_sim):
"""
preprocess lineage data to calculate weighted consensus.
calculate weight for sequence at time t_sim
Parameters
----------
data : list
list of list containing lineages data.
data contains lineage_name, sequence, n_infected_t, t
t_sim : float
time of the simulation at which we evalute the weights.
Returns
-------
seq_for_consensus : list
[sequence, #_infected_t, w_t(t_sim)].
"""
# get parameters for w_t(t_sim)
params = w.w_t_params()
k_e = params[0]
k_a = params[1]
seq_for_consensus = []
# loop over data, compute weight and add entry to list
for lst in data:
seq_for_consensus.append([
lst[0],
lst[1],
w.weights(lst[2],t_sim, k_e, k_a, w.t_max)
])
return seq_for_consensus
[docs]
def build_weighted_consensus_matrix(data):
'''
Parameters
----------
data : TYPE
DESCRIPTION.
Returns
-------
matrix : TYPE
DESCRIPTION.
bases : TYPE
DESCRIPTION.
unique_positions : TYPE
DESCRIPTION.
'''
# Extract unique positions
unique_positions= sorted(set(num for entry in data for num, _ in entry[0]))
num_index = {num: i for i, num in enumerate(unique_positions)}
bases = ['A', 'T', 'C', 'G']
letter_index = {letter: i for i, letter in enumerate(bases)}
# Initialize the matrix
matrix = np.zeros((len(bases), len(unique_positions)), dtype=float)
# Populate the matrix, accounting for missing positions
for seq, n_infected, weight in data:
# Determine missing positions
existing_positions = set(num for num, _ in seq)
missing_positions = set(unique_positions) - existing_positions
# Update matrix for existing positions
for num, char in seq:
row = letter_index[char]
col = num_index[num]
matrix[row, col] += n_infected * weight
# Update matrix for missing positions
for num in missing_positions:
char = reference[num] # fetch base from reference genome
row = letter_index[char]
col = num_index[num]
matrix[row, col] += n_infected * weight
return matrix, bases, unique_positions
[docs]
def weighted_consensus(matrix, positions):
'''
calculate the weighted consensus sequence between individuals in the population
'''
bases = ['A', 'T', 'C', 'G']
consensus = []
# Iterate through each column in the matrix to find the base with the highest count
for col_idx, pos in enumerate(positions):
# Use argmax to find the index of the maximum count in the column
max_base_idx = np.argmax(matrix[:, col_idx])
# Use the index to get the corresponding letter
max_base= bases[max_base_idx]
# if position is diffrent from wt, append it. (we encode sequences as only positions that differ from wt)
if max_base != reference[pos]:
# Append the position (num) and the letter with the highest count
consensus.append([pos, max_base])
return consensus
[docs]
def get_consensus(data,t):
data = get_seq_weights(data,t)
matrix, bases, positions= build_weighted_consensus_matrix(data)
return weighted_consensus(matrix, positions)
# ## example use
# data = [
# [[[10,'A'],[45,'G']],10,10],
# [[[13,'A']], 5,10],
# [[[10,'C'],[45,'G']],10,10]
# ]
# # data contains sequence, n_infected_t, t
# data = get_seq_weights(data,100)
# # Build the ndarray
# matrix, bases, positions= build_weighted_consensus_matrix(data)
# # Call the function with the matrix and unique numbers
# positions_with_max_letter = weighted_consensus(matrix, positions)
# # # To display the result similarly to a pandas DataFrame for visualization:
# # import pandas as pd
# # df = pd.DataFrame(matrix, index=bases, columns=positions)
# # print(df)
# # Display the result
# print(positions_with_max_letter)
# import simplicity.evolution.decoder as dec
# example_consensus_decoded = dec.decode_genome(positions_with_max_letter)
# print(example_consensus_decoded)