import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO


def stratified_sampling(df, sample_size=5000):
    """Draw the same number of random rows from every ``label`` class.

    The per-class sample size is ``sample_size`` capped at the size of the
    smallest class, so every class contributes equally many rows.

    Args:
        df: DataFrame containing a ``label`` column.
        sample_size: Upper bound on the number of rows drawn per label.

    Returns:
        A new DataFrame with the sampled rows, grouped by label in sorted
        label order, with a fresh 0..n-1 index. All columns of ``df``
        (including ``label``) are kept.
    """
    label_counts = df['label'].value_counts()
    if label_counts.empty:
        # Nothing to sample from; pd.concat would reject an empty list.
        return df.iloc[0:0].reset_index(drop=True)

    per_label = min(sample_size, int(label_counts.min()))

    # Sample each group explicitly instead of going through groupby.apply:
    # apply's handling of the grouping column is version-dependent, and the
    # 'label' column must survive into the result. Re-seeding with 42 for
    # every group keeps the draw identical to a per-group sample(...) call.
    sampled_groups = [
        group.sample(n=per_label, random_state=42)
        for _, group in df.groupby('label')
    ]
    return pd.concat(sampled_groups).reset_index(drop=True)


def fasta_to_df(fasta_file):
    """Load a FASTA file into a DataFrame, one row per parsable record.

    The record description is expected to be the record id, a space, and
    then ``key:value`` fields separated by ``|``. The ``species``,
    ``sequence_length`` and ``label`` fields are extracted; missing fields
    become ``None`` (``0`` for ``sequence_length``). Records whose
    description cannot be split into ``key:value`` fields are reported on
    stdout and skipped entirely.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        DataFrame with columns ``unique_id``, ``species``,
        ``sequence_length``, ``label`` and ``sequence``.

    Raises:
        ValueError: If a ``sequence_length`` field is not an integer.
    """
    columns = ['unique_id', 'species', 'sequence_length', 'label', 'sequence']
    rows = []

    for record in SeqIO.parse(fasta_file, "fasta"):
        # Everything before the first space is the id; the rest is metadata.
        unique_id, _, metadata_text = record.description.partition(' ')

        try:
            metadata = {}
            for part in metadata_text.split('|'):
                # Split on the first colon only so values that themselves
                # contain ':' are kept intact.
                key, value = part.split(':', 1)
                metadata[key.strip()] = value.strip()
        except ValueError as e:
            print(f"Error parsing description for record {record.id}: {e}")
            continue

        # Build the whole row before storing it so a skipped or failing
        # record can never leave the columns with different lengths.
        rows.append({
            'unique_id': unique_id,
            'species': metadata.get('species'),
            'sequence_length': int(metadata.get('sequence_length', 0)),
            'label': metadata.get('label'),
            'sequence': str(record.seq),
        })

    return pd.DataFrame(rows, columns=columns)