|
import pandas as pd |
|
from Bio.Seq import Seq |
|
from Bio.SeqRecord import SeqRecord |
|
from Bio import SeqIO |
|
|
|
def stratified_sampling(df, sample_size=5000):
    """Return a class-balanced random sample of ``df``.

    The same number of rows is drawn (without replacement) from every
    distinct value of the ``'label'`` column. That number is ``sample_size``
    capped at the size of the smallest label group, so no group is ever
    asked for more rows than it has.

    Args:
        df: DataFrame containing a ``'label'`` column.
        sample_size: Maximum number of rows to draw per label.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index, holding the sampled
        rows grouped by label (in sorted label order). All columns of
        ``df``, including ``'label'``, are preserved. If ``df`` has no
        non-null labels an empty frame with the same columns is returned.

    Raises:
        KeyError: If ``df`` has no ``'label'`` column.
    """
    label_counts = df['label'].value_counts()

    # No labels at all (empty frame or all-NaN labels): there is nothing to
    # sample, and ``label_counts.min()`` would be NaN, which ``min`` below
    # would silently ignore.
    if label_counts.empty:
        return df.iloc[0:0].reset_index(drop=True)

    per_label = min(sample_size, label_counts.min())

    # Sample each group explicitly instead of ``groupby(...).apply(...)``:
    # ``apply`` operating on the grouping column is deprecated in recent
    # pandas, and once it is excluded the 'label' column would vanish from
    # the result. Every group is sampled with the same seed, so the output
    # is reproducible.
    sampled_groups = [
        group.sample(n=per_label, random_state=42)
        for _, group in df.groupby('label')
    ]
    return pd.concat(sampled_groups, ignore_index=True)
|
|
|
def fasta_to_df(fasta_file):
    """Load a FASTA file into a DataFrame, one row per parsable record.

    Each record description is split at the first space: the part before
    it becomes ``unique_id`` and the remainder is read as ``|``-separated
    ``key: value`` pairs. The ``species``, ``sequence_length`` and ``label``
    keys are extracted from those pairs.

    Records whose description cannot be parsed (a segment without ``:``,
    no metadata at all, or a non-integer ``sequence_length``) are reported
    on stdout and skipped entirely, so every column stays the same length.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        DataFrame with columns ``unique_id``, ``species``,
        ``sequence_length``, ``label`` and ``sequence``. ``species`` and
        ``label`` are ``None`` when the key is absent; ``sequence_length``
        defaults to ``0`` when absent.
    """
    columns = ['unique_id', 'species', 'sequence_length', 'label', 'sequence']
    rows = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        # "<id> <metadata>"; metadata is '' when the description has no space.
        unique_id, _, metadata = record.description.partition(' ')

        try:
            fields = {}
            for part in metadata.split('|'):
                # Split on the first ':' only so values that themselves
                # contain a colon are kept whole; a segment with no ':'
                # raises ValueError and the record is skipped.
                key, value = part.split(':', 1)
                fields[key.strip()] = value.strip()
            sequence_length = int(fields.get('sequence_length', 0))
        except ValueError as e:
            print(f"Error parsing description for record {record.id}: {e}")
            continue

        # Build the row only after parsing succeeded; appending any field
        # earlier would leave the columns with mismatched lengths.
        rows.append({
            'unique_id': unique_id,
            'species': fields.get('species'),
            'sequence_length': sequence_length,
            'label': fields.get('label'),
            'sequence': str(record.seq),
        })

    return pd.DataFrame(rows, columns=columns)