Source code for pydna_epbd.input_reader
import os
from pydna_epbd.configs import InputConfigs
def read_sequences_from_a_file(
    input_seqs_dir, filename_with_ext, is_first_col_id, flanks, outputs_dir
):
    """Read DNA sequences from one input file and create its output directory.

    Each non-blank line holds one whitespace-separated record: either
    ``<seq_id> <seq>`` (when ``is_first_col_id`` is true) or just ``<seq>``.
    Blank lines are skipped.

    Args:
        input_seqs_dir (str): Directory that contains the DNA sequences file.
            A trailing path separator is optional.
        filename_with_ext (str): Filename with extension.
        is_first_col_id (bool): Whether the first column of each line is the
            sequence id.
        flanks (str): Flanks added at both sides of every input sequence.
        outputs_dir (str): The output directory for saving simulation outputs.
            A trailing path separator is optional.

    Returns:
        list: One tuple per sequence in the file, in file order.
            If seq-id is not present, a 1-indexed incrementally generated
            seq-id (an int counting the sequences read so far) is attached.
            Format: [("seq_output_dir", "seq_id", "seq")]

    Raises:
        IndexError: If ``is_first_col_id`` is true and a non-blank line has
            fewer than two columns.
    """
    # os.path.join copes with directories given with or without a trailing
    # separator; plain string concatenation silently built wrong paths.
    filepath = os.path.join(input_seqs_dir, filename_with_ext)

    # splitext handles extensions of any length (".txt", ".fasta", none).
    filename_wo_ext = os.path.splitext(filename_with_ext)[0]

    # The empty last component keeps the trailing separator on the returned
    # directory, so callers that append a filename directly keep working.
    seq_output_dir = os.path.join(outputs_dir, filename_wo_ext, "")
    os.makedirs(seq_output_dir, exist_ok=True)

    sequences = []
    with open(filepath, "r") as f:
        for line in f:
            line_items = line.split()
            if not line_items:
                # Blank (e.g. trailing) lines carry no sequence.
                continue
            if is_first_col_id:
                seq_id, seq = line_items[0], line_items[1]
            else:
                # 1-based incremental id for sequences in a file.
                seq_id, seq = len(sequences) + 1, line_items[0]
            sequences.append((seq_output_dir, seq_id, flanks + seq + flanks))
    return sequences
def read_all_sequences(input_seqs_dir, is_first_col_id, flanks, outputs_dir):
    """Read all sequences for the simulation.

    Every regular file directly inside ``input_seqs_dir`` is read with
    ``read_sequences_from_a_file``; sub-directories are skipped. Files are
    visited in sorted filename order so the returned list is reproducible
    (``os.listdir`` itself returns entries in arbitrary order).

    Prints the total number of sequences and the total number of base pairs
    (excluding flanks) to stdout.

    Args:
        input_seqs_dir (str): Directory that contains files containing DNA sequences.
        is_first_col_id (bool): Whether or not the first column of the sequence
            containing file determines sequence id.
        flanks (str): Flanks are added at both sides of all input sequences.
        outputs_dir (str): The output directory for saving simulation outputs.

    Returns:
        list: A list containing all sequences in the input directory, as the
            tuples produced by ``read_sequences_from_a_file``.
    """
    all_seqs = []
    total_num_of_bps = 0
    flanks_len = 2 * len(flanks)  # flanks are attached on both sides

    for filename_with_ext in sorted(os.listdir(input_seqs_dir)):
        if not os.path.isfile(os.path.join(input_seqs_dir, filename_with_ext)):
            continue
        seqs = read_sequences_from_a_file(
            input_seqs_dir, filename_with_ext, is_first_col_id, flanks, outputs_dir
        )
        all_seqs += seqs
        # Count each sequence individually: this is correct for files with
        # sequences of differing lengths and safe for files with none.
        total_num_of_bps += sum(len(seq) - flanks_len for _, _, seq in seqs)

    print(f"total #-seqs: {len(all_seqs)}")
    print(f"total #-bps (w/o flanks): {total_num_of_bps}")
    return all_seqs
def read_configurations(configuration_filepath):
    """Build the simulation inputs from a ``Name = Value`` configuration file.

    Besides parsing the file, this exports the monitor switches as environment
    variables, reads every input sequence (which also creates the output
    directories) and prints the resulting configuration object.

    Args:
        configuration_filepath (str): Path of the configuration file holding
            the settings needed to run the simulations. The structure of the
            configs are given in the Readme file.

    Returns:
        InputConfigs: The class containing specific format for running the simulations.

    Raises:
        KeyError: If a required setting is absent from the file.
        ValueError: If a numeric setting cannot be converted.
    """
    # Split every line at its first "="; a later duplicate name overrides an
    # earlier one, and lines without "=" map their text to an empty value.
    with open(configuration_filepath, "r") as config_file:
        configs = {
            key.strip(): value.strip()
            for key, _, value in (row.partition("=") for row in config_file)
        }

    def _is_yes(key):
        # Flags are enabled only by the exact string "Yes".
        return configs[key] == "Yes"

    # Global simulation settings.
    is_first_col_id = _is_yes("IsFirstColumnId")
    save_full = _is_yes("SaveFull")
    save_runtime = _is_yes("SaveRuntime")
    input_seqs_dir = configs["SequencesDir"]
    outputs_dir = configs["OutputsDir"]
    flanks = configs["Flanks"]
    if flanks == "None":
        flanks = ""
    temperature = float(configs["Temperature"])
    n_iterations = int(configs["Iterations"])
    n_preheating_steps = int(configs["PreheatingSteps"])
    n_post_preheating_steps = int(configs["PostPreheatingSteps"])
    n_nodes = int(configs["ComputingNodes"])

    # Monitor switches are handed over through the process environment.
    monitor_settings = (
        ("BUBBLE_MONITOR", "BubbleMonitor"),
        ("COORD_MONITOR", "CoordinateMonitor"),
        ("FLIPPING_MONITOR_VERBOSE", "FlippingMonitorVerbose"),
        ("FLIPPING_MONITOR", "FlippingMonitor"),
        ("ENERGY_MONITOR", "EnergyMonitor"),
        ("MELTING_AND_FRACTION_MONITOR", "MeltingAndFractionMonitor"),
        ("MELTING_AND_FRACTION_MANY_MONITOR", "MeltingAndFractionManyMonitor"),
    )
    for env_name, config_key in monitor_settings:
        os.environ[env_name] = configs[config_key]

    # Reading the sequences also creates the per-file output directories.
    sequences = read_all_sequences(input_seqs_dir, is_first_col_id, flanks, outputs_dir)

    input_configs = InputConfigs(
        temperature,
        sequences,
        outputs_dir,
        n_iterations,
        n_preheating_steps,
        n_post_preheating_steps,
        n_nodes,
        save_full,
        save_runtime,
    )
    print(input_configs)
    return input_configs
# read_configurations("../examples/configs.txt")