# Source code for pydna_epbd.input_reader

import os
from pydna_epbd.configs import InputConfigs


def read_sequences_from_a_file(
    input_seqs_dir, filename_with_ext, is_first_col_id, flanks, outputs_dir
):
    """Read DNA sequences from one input file and create its output directory.

    Every non-blank line of the file is split on whitespace. When
    ``is_first_col_id`` is true the first column is the sequence id and the
    second column is the sequence; otherwise the first column is the sequence
    and a 1-based running id is generated. Blank lines are skipped.

    As a side effect, the directory ``<outputs_dir>/<filename without
    extension>/`` is created if it does not exist yet.

    Args:
        input_seqs_dir (str): Directory that contains the DNA sequences file.
        filename_with_ext (str): Filename with extension.
        is_first_col_id (bool): Whether or not the first column of the
            sequence containing file determines sequence id.
        flanks (str): Flanks are added at both sides of all input sequences.
        outputs_dir (str): The output directory for saving simulation outputs.

    Returns:
        list: A list of tuples containing all sequences in the input sequence
        file with seq-id and output directory. If seq-id is not present, a
        1-indexed incrementally generated seq-id (int) is attached.
        Format: [("seq_output_dir", "seq_id", "seq")]

    Raises:
        IndexError: If a non-blank line has fewer columns than required.
    """
    # os.path.join works whether or not the directories end with a separator.
    filepath = os.path.join(input_seqs_dir, filename_with_ext)
    # splitext handles extensions of any length (".txt", ".fasta", none, ...).
    filename_wo_ext = os.path.splitext(filename_with_ext)[0]

    # The empty last component makes the returned directory end with a path
    # separator, matching the documented "seq_output_dir" format.
    seq_output_dir = os.path.join(outputs_dir, filename_wo_ext, "")
    os.makedirs(seq_output_dir, exist_ok=True)

    sequences = []
    with open(filepath, "r") as f:
        for line in f:
            # split() already strips surrounding whitespace from each item.
            line_items = line.split()
            if not line_items:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue

            if is_first_col_id:
                seq_id, seq = line_items[0], line_items[1]
            else:
                # 1-based incremental id over the sequences actually read.
                seq_id, seq = len(sequences) + 1, line_items[0]

            sequences.append((seq_output_dir, seq_id, flanks + seq + flanks))
    return sequences


def read_all_sequences(input_seqs_dir, is_first_col_id, flanks, outputs_dir):
    """Read all sequences for the simulation.

    Every file in ``input_seqs_dir`` is read with
    ``read_sequences_from_a_file`` (sub-directories are skipped) and the
    results are concatenated. The number of sequences and the number of base
    pairs (flanks excluded) are printed as a summary.

    Args:
        input_seqs_dir (str): Directory that contains files containing DNA sequences.
        is_first_col_id (bool): Whether or not the first column of the
            sequence containing file determines sequence id.
        flanks (str): Flanks are added at both sides of all input sequences.
        outputs_dir (str): The output directory for saving simulation outputs.

    Returns:
        list: A list containing all sequences in the input directory, as
        ``(seq_output_dir, seq_id, seq)`` tuples, files visited in sorted
        filename order.
    """
    all_seqs = []
    total_num_of_bps = 0
    # Each returned sequence carries the flanks on both sides.
    flanks_len = 2 * len(flanks)

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename_with_ext in sorted(os.listdir(input_seqs_dir)):
        if os.path.isdir(os.path.join(input_seqs_dir, filename_with_ext)):
            # Only files hold sequences; a directory cannot be opened for reading.
            continue

        seqs = read_sequences_from_a_file(
            input_seqs_dir, filename_with_ext, is_first_col_id, flanks, outputs_dir
        )
        all_seqs += seqs

        # Sum per sequence: files may be empty or hold sequences of
        # different lengths.
        total_num_of_bps += sum(len(seq) - flanks_len for _, _, seq in seqs)

    print(f"total #-seqs: {len(all_seqs)}")
    print(f"total #-bps (w/o flanks): {total_num_of_bps}")
    return all_seqs


def read_configurations(configuration_filepath):
    """Build the simulation inputs from a ``name = value`` configuration file.

    Each line of the file is split at its first ``=``; the stripped left part
    is the setting name and the stripped right part is its value. The monitor
    settings are exported as environment variables, the sequences are read
    from the configured sequences directory, and everything is bundled into
    an ``InputConfigs`` object, which is also printed.

    Args:
        configuration_filepath (str): The configurations needed to run MCMC simulations.
            The structure of the configs are given in the Readme file.

    Returns:
        InputConfigs: The class containing specific format for running the simulations.

    Raises:
        KeyError: If a required setting is missing from the file.
        ValueError: If a numeric setting cannot be converted.
    """
    # Parse the file into a flat name -> value mapping.
    configs = {}
    with open(configuration_filepath, "r") as config_file:
        for raw_line in config_file:
            key, _, value = raw_line.partition("=")
            configs[key.strip()] = value.strip()

    def is_yes(key):
        # Boolean settings are enabled only by the exact string "Yes".
        return configs[key] == "Yes"

    # Global simulation settings.
    is_first_col_id = is_yes("IsFirstColumnId")
    save_full = is_yes("SaveFull")
    save_runtime = is_yes("SaveRuntime")
    input_seqs_dir = configs["SequencesDir"]
    outputs_dir = configs["OutputsDir"]
    flanks = configs["Flanks"]
    if flanks == "None":
        # "None" in the file means no flanks at all.
        flanks = ""
    temperature = float(configs["Temperature"])
    n_iterations = int(configs["Iterations"])
    n_preheating_steps = int(configs["PreheatingSteps"])
    n_post_preheating_steps = int(configs["PostPreheatingSteps"])
    n_nodes = int(configs["ComputingNodes"])

    # Monitor settings are passed on through environment variables.
    monitor_env_vars = (
        ("BUBBLE_MONITOR", "BubbleMonitor"),
        ("COORD_MONITOR", "CoordinateMonitor"),
        ("FLIPPING_MONITOR_VERBOSE", "FlippingMonitorVerbose"),
        ("FLIPPING_MONITOR", "FlippingMonitor"),
        ("ENERGY_MONITOR", "EnergyMonitor"),
        ("MELTING_AND_FRACTION_MONITOR", "MeltingAndFractionMonitor"),
        ("MELTING_AND_FRACTION_MANY_MONITOR", "MeltingAndFractionManyMonitor"),
    )
    for env_name, config_key in monitor_env_vars:
        os.environ[env_name] = configs[config_key]

    # Reading the sequences also creates the per-file output directories.
    sequences = read_all_sequences(input_seqs_dir, is_first_col_id, flanks, outputs_dir)

    input_configs = InputConfigs(
        temperature,
        sequences,
        outputs_dir,
        n_iterations,
        n_preheating_steps,
        n_post_preheating_steps,
        n_nodes,
        save_full,
        save_runtime,
    )
    print(input_configs)
    return input_configs


# read_configurations("../examples/configs.txt")