Spaces:

afg1
/

pomBase-screener

Sleeping

App Files Files Community

Andrew Green committed on Dec 9, 2024

Commit

c23cd24

1 Parent(s): 0e4ad79

batch inference and add progressbar

Browse files

Files changed (1) hide show

app.py +56 -11

app.py CHANGED Viewed

@@ -23,7 +23,7 @@ def get_pipeline():
     model_name = "afg1/pombe_curation_fold_0"
-    pipe = pipeline(model=model_name)
     return pipe
@@ -31,16 +31,58 @@ def get_pipeline():
 @spaces.GPU
-def classify_abstracts(abstracts:Dict[str, str]) -> None:
     pipe = get_pipeline()
-    pmids = list(abstracts.keys())
-    classification = pipe(text=list(abstracts.values()))
-    for pmid, abs in zip(pmids, classification):
-        abs['label'] = label_lookup[abs['label']]
-        abs['pmid'] = pmid
-    return classification
@@ -122,9 +164,9 @@ def fetch_abstracts_batch(pmids: List[str], batch_size: int = 200) -> Dict[str,
                         # Simple abstract
                         abstract_text = abstract_element.text
                 else:
-                    abstract_text = "No abstract available"
-                all_abstracts[pmid] = abstract_text
             # Respect NCBI's rate limits
             time.sleep(0.34)
@@ -275,6 +317,9 @@ def create_interface():
         with gr.Row():
             d = gr.DownloadButton("Download results", visible=True, interactive=False)
         d.click(download_file, None, d)
         search_button.click(

     model_name = "afg1/pombe_curation_fold_0"
+    pipe = pipeline(model=model_name, task="text-classification")
     return pipe
 @spaces.GPU
def classify_abstracts(abstracts: Dict[str, str], batch_size: int = 64, progress=gr.Progress()) -> List[dict]:
    """Classify abstracts in batches, reporting progress after each batch.

    Args:
        abstracts: Mapping of PMID to abstract text. Results follow the
            mapping's iteration order.
        batch_size: Number of abstracts handed to the pipeline per call.
            Must be at least 1.
        progress: Progress callback, invoked with a completed fraction in
            [0, 1] and a ``desc`` status message.

    Returns:
        One dict per successfully classified abstract with the keys
        ``'pmid'``, ``'classification'`` (the label exactly as returned by
        the pipeline) and ``'score'``. Abstracts belonging to a batch that
        raised are absent from the result.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before loading the model so a bad argument fails fast and
    # with a clearer message than the one range() would produce.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    pipe = get_pipeline()

    # Parallel lists; dict ordering keeps PMIDs aligned with their abstracts.
    pmids = list(abstracts)
    abstract_texts = list(abstracts.values())
    total = len(pmids)
    results = []

    progress(0, desc="Starting classification...")

    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        try:
            classifications = pipe(abstract_texts[start:end])
            # Build the whole batch first so a malformed entry part-way
            # through never leaves a partially recorded batch in `results`.
            batch_results = [
                {
                    'pmid': pmid,
                    'classification': classification['label'],
                    'score': classification['score'],
                }
                for pmid, classification in zip(pmids[start:end], classifications)
            ]
            results.extend(batch_results)
            progress(end / total,
                     desc=f"Classified {end}/{total} abstracts...")
        except Exception as e:
            # Best effort: report the failing batch and carry on with the
            # remaining ones instead of aborting the whole run.
            print(f"Error classifying batch starting at index {start}: {e}")
            continue

    progress(1.0, desc="Classification complete!")
    return results
                         # Simple abstract
                         abstract_text = abstract_element.text
                 else:
+                    abstract_text = ""
+                if len(abstract_text) > 0:
+                    all_abstracts[pmid] = abstract_text
             # Respect NCBI's rate limits
             time.sleep(0.34)
         with gr.Row():
             d = gr.DownloadButton("Download results", visible=True, interactive=False)
+        with gr.Row():
+            progress=gr.Progress()
         d.click(download_file, None, d)
         search_button.click(