PathoLM / utils.py
Sajib-006's picture
add files
28d5b6a verified
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def stratified_sampling(df, sample_size=5000, random_state=42):
    """Return a class-balanced random subset of ``df``.

    Every distinct value in the ``'label'`` column contributes the same
    number of rows: ``sample_size``, capped at the size of the rarest label
    so that sampling without replacement always succeeds.

    Args:
        df: DataFrame containing a ``'label'`` column.
        sample_size: Maximum number of rows to draw per label.
        random_state: Seed forwarded to ``DataFrame.sample`` for every label
            group. The default (42) is the seed that used to be hard-coded.

    Returns:
        A new DataFrame holding the sampled rows with a fresh ``0..n-1``
        index. When ``df`` has no labelled rows, an empty DataFrame with the
        same columns is returned.
    """
    label_counts = df['label'].value_counts()
    if label_counts.empty:
        # Nothing to sample from; min() of an empty Series would be NaN.
        return df.iloc[0:0].reset_index(drop=True)

    per_label = min(sample_size, int(label_counts.min()))

    # Sample each group explicitly instead of going through groupby.apply:
    # apply's handling of the grouping column is deprecated in recent pandas
    # and would eventually drop 'label' from the result. Iterating the
    # groups always yields the full sub-frame, label column included.
    sampled_groups = [
        group.sample(n=per_label, random_state=random_state)
        for _, group in df.groupby('label')
    ]
    return pd.concat(sampled_groups, ignore_index=True)
def _parse_description_fields(metadata):
    """Parse ``key:value|key:value`` text into a dict of stripped strings.

    Raises:
        ValueError: if any ``|``-separated field has no ``:`` separator
            (this includes an empty ``metadata`` string).
    """
    fields = {}
    for part in metadata.split('|'):
        # Split on the first ':' only so values that themselves contain a
        # colon are kept whole rather than silently truncated.
        key, separator, value = part.partition(':')
        if not separator:
            raise ValueError(f"missing ':' in description field {part!r}")
        fields[key.strip()] = value.strip()
    return fields


def fasta_to_df(fasta_file):
    """Load a FASTA file into a DataFrame, one row per parsable record.

    Each record description is expected to look like
    ``<unique_id> key:value|key:value|...``; the ``species``,
    ``sequence_length`` and ``label`` keys are extracted. Records whose
    description cannot be parsed are reported on stdout and skipped.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        DataFrame with columns ``unique_id``, ``species``,
        ``sequence_length``, ``label`` and ``sequence``. ``species`` and
        ``label`` are ``None`` when the key is absent; ``sequence_length``
        defaults to 0 when absent.
    """
    columns = {
        'unique_id': [],
        'species': [],
        'sequence_length': [],
        'label': [],
        'sequence': [],
    }

    for record in SeqIO.parse(fasta_file, "fasta"):
        # First whitespace-delimited token is the id; the rest is metadata.
        unique_id, _, metadata = record.description.partition(' ')

        try:
            fields = _parse_description_fields(metadata)
            sequence_length = int(fields.get('sequence_length', 0))
        except ValueError as e:
            print(f"Error parsing description for record {record.id}: {e}")
            continue

        # Append to every column only after parsing succeeded, so a skipped
        # record can never leave the columns with different lengths.
        columns['unique_id'].append(unique_id)
        columns['species'].append(fields.get('species'))
        columns['sequence_length'].append(sequence_length)
        columns['label'].append(fields.get('label'))
        columns['sequence'].append(str(record.seq))

    return pd.DataFrame(columns)