Source code for epbd_bert.utility.data_utils
import torch
import pandas as pd
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
import epbd_bert.utility.pickle_utils as pickle_utils
# reading and processing peakfiles metadata to generate the labels dict
[docs]
def get_uniform_peaks_metadata(home_dir="/usr/projects/pyDNA_EPBD/tf_dna_binding/"):
col_names = [
"filename",
"project",
"lab",
"composite",
"dataType",
"view",
"cell",
"treatment",
"antibody",
"control",
"dataVersion",
"dccAccession",
"controlId",
"quality",
"tableName",
"type",
"md5sum",
"size",
]
with open(home_dir + "data/downloads/wgEncodeAwgTfbsUniform/files.txt", "r") as h:
data = []
for line in h.readlines():
# print(line.strip().split(";"))
line_items = line.strip().split(";")
row = {}
filename, project = line_items[0].split("\t")
project = project.split("=")[1]
# print(filename, project)
row["filename"] = filename
row["project"] = project
for line_item in line_items[1:]:
key, value = line_item.strip().split("=")
row[key] = value
data.append(row)
# print(row)
# break
peaks_metadata_df = pd.DataFrame.from_records(data)
print(peaks_metadata_df.shape)
print(peaks_metadata_df.columns)
return peaks_metadata_df
[docs]
def compute_multi_class_weights(home_dir=""):
data_path = home_dir + "resources/train_val_test/peaks_with_labels_train.tsv.gz"
data_df = pd.read_csv(data_path, compression="gzip", sep="\t")
labels_dict = pickle_utils.load(home_dir + "resources/processed_data/peakfilename_index_dict.pkl")
all_labels = []
def get_all_labels(labels):
for l in labels.split(","):
l = l.strip()
all_labels.append(labels_dict[l])
data_df["labels"].apply(get_all_labels)
class_weights = compute_class_weight("balanced", classes=np.array(list(range(len(labels_dict)))), y=all_labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)
# print(class_weights)
return class_weights
# compute_multi_class_weights()