add files

Browse files

Files changed (7) hide show

.gitattributes +2 -0
LICENSE +21 -0
data/test.fasta +3 -0
eval_model.py +85 -0
output.log +1 -0
pathoLM.png +3 -0
utils.py +45 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/test.fasta filter=lfs diff=lfs merge=lfs -text
+pathoLM.png filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Sajib Acharjee Dip
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/test.fasta ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f8e8010c816aa6fae67ef6a53e35b2907a5b54e610e1dac7e9912dc20526b40
+size 11990791

eval_model.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import argparse
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from datasets import Dataset
+import pandas as pd
+import numpy as np
+from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, balanced_accuracy_score, roc_auc_score, confusion_matrix
+from utils import fasta_to_df
+def compute_metrics(logits, labels):
+    """Compute classification metrics from raw logits and integer labels.
+
+    Args:
+        logits: Array-like indexed as (sample, class); the predicted class is
+            the argmax over axis 1.
+        labels: Array-like of integer class ids, one per sample.
+
+    Returns:
+        dict with keys 'accuracy', 'f1_score' (weighted average), 'mcc',
+        'auc_roc' (None unless exactly two distinct labels are present),
+        'balanced_accuracy' and 'confusion_matrix' (nested lists).
+    """
+    logits = np.asarray(logits)
+    labels = np.asarray(labels, dtype=int)
+    predictions = np.argmax(logits, axis=1).astype(int)
+    acc = accuracy_score(labels, predictions)
+    f1 = f1_score(labels, predictions, average='weighted')
+    mcc = matthews_corrcoef(labels, predictions)
+    balanced_acc = balanced_accuracy_score(labels, predictions)
+    auc_roc = None
+    if len(np.unique(labels)) == 2:
+        # Softmax with the row maximum subtracted first: mathematically the
+        # same probabilities, but exp() can no longer overflow to inf and
+        # turn the class-1 score into NaN (which roc_auc_score rejects).
+        shifted = logits - logits.max(axis=1, keepdims=True)
+        exp_scores = np.exp(shifted)
+        probs = exp_scores[:, 1] / exp_scores.sum(axis=1)
+        auc_roc = roc_auc_score(labels, probs)
+    cm = confusion_matrix(labels, predictions)
+    return {
+        'accuracy': acc,
+        'f1_score': f1,
+        'mcc': mcc,
+        'auc_roc': auc_roc,
+        'balanced_accuracy': balanced_acc,
+        'confusion_matrix': cm.tolist()
+    }
+def encode_sequence(sequence, tokenizer, max_length):
+    """Call `tokenizer` on `sequence` with truncation and fixed-length padding to `max_length`, requesting 'pt' tensors."""
+    tokenizer_options = {
+        'truncation': True,
+        'padding': 'max_length',
+        'max_length': max_length,
+        'return_tensors': 'pt',
+    }
+    return tokenizer(sequence, **tokenizer_options)
+def evaluate(model_path, test_file=None, sequence=None):
+    """Classify a single sequence, or score a labelled FASTA file, with a saved model.
+
+    Args:
+        model_path: Path or identifier handed to `from_pretrained` for both
+            the model and the tokenizer.
+        test_file: FASTA file parsed by `fasta_to_df`; used only when
+            `sequence` is empty.
+        sequence: A single sequence. When given, its predicted class index is
+            printed and `test_file` is ignored.
+
+    Raises:
+        ValueError: if neither `sequence` nor `test_file` is supplied, if no
+            record could be parsed from `test_file`, or if a record's label is
+            not 'pathogen' / 'non-pathogen' (case-insensitive).
+    """
+    if not sequence and not test_file:
+        raise ValueError("Provide either a sequence or a test_file to evaluate.")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = AutoModelForSequenceClassification.from_pretrained(model_path, ignore_mismatched_sizes=True).to(device)
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    # Explicit for both code paths so inference never depends on the mode the
+    # loader happened to leave the model in.
+    model.eval()
+    if sequence:
+        inputs = encode_sequence(sequence, tokenizer, tokenizer.model_max_length)
+        with torch.no_grad():
+            outputs = model(**{k: v.to(device) for k, v in inputs.items()})
+        logits = outputs.logits.cpu().numpy()
+        print("Single Sequence Prediction:", np.argmax(logits, axis=1))
+        return
+    test_df = fasta_to_df(test_file)
+    if test_df.empty:
+        raise ValueError(f"No records could be parsed from {test_file!r}.")
+    label_map = {
+        'non-pathogen': 0,
+        'pathogen': 1
+    }
+    mapped_labels = test_df['label'].str.lower().map(label_map)
+    unmapped = mapped_labels.isna()
+    if unmapped.any():
+        # An unmapped label would become NaN and later be cast to an arbitrary
+        # integer, silently corrupting every metric; fail loudly instead.
+        bad_labels = sorted(set(test_df.loc[unmapped, 'label'].astype(str)))
+        raise ValueError(f"Unrecognized labels in {test_file!r}: {bad_labels}; expected one of {sorted(label_map)}.")
+    test_df['label'] = mapped_labels.astype(int)
+    dataset = Dataset.from_pandas(test_df)
+    dataset = dataset.map(lambda x: encode_sequence(x['sequence'], tokenizer, tokenizer.model_max_length), batched=True)
+    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
+    logits_list, labels_list = [], []
+    with torch.no_grad():
+        for batch in dataloader:
+            # Labels stay on the host; only the model inputs move to `device`.
+            inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
+            outputs = model(**inputs)
+            logits_list.append(outputs.logits.cpu().numpy())
+            labels_list.append(batch['label'].cpu().numpy())
+    logits = np.concatenate(logits_list, axis=0)
+    labels = np.concatenate(labels_list, axis=0)
+    results = compute_metrics(logits, labels)
+    print("Evaluation Metrics:", results)
+if __name__ == "__main__":
+    # Command-line entry point: parse the arguments and hand them to evaluate().
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, required=True, help="Path to the fine-tuned model directory")
+    parser.add_argument("--test_file", type=str, help="Path to the test fasta file")
+    parser.add_argument("--sequence", type=str, help="Single DNA sequence to classify")
+    args = parser.parse_args()
+    if not args.test_file and not args.sequence:
+        # Report the missing input as a usage error instead of letting the
+        # run fail with a traceback after the model has been loaded.
+        parser.error("one of --test_file or --sequence is required")
+    evaluate(args.model_path, args.test_file, args.sequence)

output.log ADDED Viewed

	@@ -0,0 +1 @@


1	+ Evaluation Metrics: {'accuracy': 0.6747420367877972, 'f1_score': 0.6143408107901266, 'mcc': 0.3983982161600102, 'auc_roc': 0.26562839560775575, 'balanced_accuracy': 0.6260618867430012, 'confusion_matrix': [[736, 2171], [4, 3776]]}

pathoLM.png ADDED Viewed

Git LFS Details

SHA256: bcb379dbbcd17e4a8c71af7a36f26b32398bf4af511043f0dffbab5fef36ee0e
Pointer size: 131 Bytes
Size of remote file: 312 kB

utils.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import pandas as pd
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO
+def stratified_sampling(df, sample_size=5000):
+    """Draw the same number of rows from every 'label' group of `df`.
+
+    Args:
+        df: DataFrame with a 'label' column.
+        sample_size: Upper bound on rows taken per label; it is lowered to the
+            size of the smallest label group so every group can supply it.
+
+    Returns:
+        A new DataFrame with a fresh 0..n-1 index, groups in sorted label
+        order, each sampled with the fixed seed 42.
+    """
+    label_counts = df['label'].value_counts()
+    if label_counts.empty:
+        # No labelled rows: nothing to sample from.
+        return df.iloc[0:0].reset_index(drop=True)
+    per_label = int(min(sample_size, label_counts.min()))
+    # Iterate the groups instead of using groupby.apply: apply's handling of
+    # the grouping column is deprecated, and once it is excluded the 'label'
+    # column would vanish from the result.
+    sampled_groups = [
+        group.sample(n=per_label, random_state=42)
+        for _, group in df.groupby('label')
+    ]
+    return pd.concat(sampled_groups).reset_index(drop=True)
+def fasta_to_df(fasta_file):
+    """Parse a FASTA file whose headers carry 'key: value' metadata into a DataFrame.
+
+    Each header is read as '<unique_id> <key>:<value>|<key>:<value>|...'.
+    The keys used are 'species', 'sequence_length' and 'label'. Records whose
+    metadata cannot be parsed are reported on stdout and skipped as a whole,
+    so every column always has one entry per kept record.
+
+    Args:
+        fasta_file: Path or handle accepted by `SeqIO.parse`.
+
+    Returns:
+        DataFrame with columns 'unique_id', 'species', 'sequence_length',
+        'label' and 'sequence'; 'species' and 'label' are None when the key is
+        absent, 'sequence_length' defaults to 0.
+    """
+    columns = ['unique_id', 'species', 'sequence_length', 'label', 'sequence']
+    rows = []
+    for record in SeqIO.parse(fasta_file, "fasta"):
+        unique_id, _, metadata = record.description.partition(' ')
+        try:
+            fields = {}
+            for part in metadata.split('|'):
+                # Split on the first ':' only, so values may contain colons.
+                key, separator, value = part.partition(':')
+                if not separator:
+                    raise ValueError(f"missing ':' in field {part!r}")
+                fields[key.strip()] = value.strip()
+            sequence_length = int(fields.get('sequence_length', 0))
+        except ValueError as e:
+            print(f"Error parsing description for record {record.id}: {e}")
+            continue
+        # Append only after the whole header parsed, keeping columns aligned.
+        rows.append({
+            'unique_id': unique_id,
+            'species': fields.get('species'),
+            'sequence_length': sequence_length,
+            'label': fields.get('label'),
+            'sequence': str(record.seq),
+        })
+    return pd.DataFrame(rows, columns=columns)