Source code for epbd_bert.datasets.sequence_epbd_multimodal_labelspecific_dataset
import transformers
from epbd_bert.datasets.sequence_epbd_multimodal_dataset import (
    SequenceEPBDMultiModalDataset,
)
class SequenceEPBDMultiModalLabelSpecificDataset(SequenceEPBDMultiModalDataset):
    """Dataset for the multi-modal transformer, restricted to a single label.

    Builds the parent SequenceEPBDMultiModalDataset and then keeps only the
    rows whose ``labels`` field contains ``label``.
    """

    def __init__(
        self,
        data_path: str,
        pydnaepbd_features_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        label="wgEncodeAwgTfbsBroadDnd41CtcfUniPk",
        home_dir="",
    ):
        super().__init__(data_path, pydnaepbd_features_path, tokenizer, home_dir)
        # Keep only the rows annotated with the requested label, then reindex.
        self.data_df = self.data_df[self.data_df["labels"].apply(lambda x: label in x)]
        self.data_df.reset_index(drop=True, inplace=True)
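
# Note on the filter above: it relies on Python's `in` operator, so it works
# whether each row's "labels" entry is a list of label names or a delimited
# string (substring match in the latter case). A minimal sketch of the same
# idea on a toy pandas DataFrame (hypothetical column values, for illustration):
#
#   import pandas as pd
#   df = pd.DataFrame({"labels": [["labelA", "labelB"], ["labelB"]]})
#   df = df[df["labels"].apply(lambda x: "labelA" in x)].reset_index(drop=True)
#   # -> keeps only the first row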
# Example usage; run with:
#   python -m epbd_bert.datasets.sequence_epbd_multimodal_labelspecific_dataset
if __name__ == "__main__":
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        "resources/DNABERT-2-117M/",
        trust_remote_code=True,
        cache_dir="resources/cache/",
    )
    ds = SequenceEPBDMultiModalLabelSpecificDataset(
        data_path="resources/train_val_test/peaks_with_labels_test.tsv.gz",
        pydnaepbd_features_path="resources/pydnaepbd_things/coord_flips/id_seqs/",  # ../data, resources
        tokenizer=tokenizer,
        label="wgEncodeAwgTfbsBroadDnd41CtcfUniPk",
    )
    print(len(ds))  # number of examples carrying the requested label
    print(ds[100])  # one tokenized multi-modal example
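
    # A minimal DataLoader sketch (assumptions: the parent dataset follows the
    # torch.utils.data.Dataset protocol via the __len__/__getitem__ used above,
    # and items batch with the default collate_fn; a custom collate_fn may be
    # needed depending on the item structure).
    from torch.utils.data import DataLoader

    loader = DataLoader(ds, batch_size=8, shuffle=False)
    batch = next(iter(loader))
    print(type(batch))  # inspect what one batch looks like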