Spaces:

SmartFlowAI
/

HuggingFaceWeeklyPaper

Running

App Files Files Community

HowardZhangdqs committed on Feb 9

Commit

ac02643

1 Parent(s): f432512

fix: label cache error

Browse files

Files changed (2) hide show

ai/classify_paper.py +30 -14
fetch_paper.py +6 -5

ai/classify_paper.py CHANGED Viewed

@@ -1,15 +1,19 @@
 import time
 from typing import List, Dict, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from threading import Lock
 from typing import List, Dict, Any
 import json
 try:
     from .ai import complete
 except ImportError:
     from ai import complete
 paper_types: Dict[str, str] = {
     "CV": "computer vision, any paper that deals with image, video, point cloud or 3D model data",
@@ -41,19 +45,21 @@ You should output in the following format with a code block:
 ```json
 [
     {
-        "index": 1,
         "category": ["RO"]
     },
     {
-        "index": 2,
         "category": ["ML"]
     },
     {
-        "index": 3,
         "category": ["LLM", "NLP"]
     }
 ]
 ```
 """.strip(),
     """
 The followings are the papers you need to classify:
@@ -61,24 +67,25 @@ The followings are the papers you need to classify:
 ])
-def build_paper(index: int, title: str, abstract: str = None) -> str:
     if abstract is None:
-        return f"{index}. {title}"
-    return f"{index}. {title}\n\n{abstract}"
 def get_classify_prompt(papers: List[Dict[str, str]]) -> str:
     prompt = []
     for index, paper in enumerate(papers, start=1):
-        prompt.append(build_paper(paper["index"] if "index" in paper else index, paper["title"], paper["abstract"] if "abstract" in paper else None))
     return user_prompt + "\n\n" + "\n\n".join(prompt)
 def parse_response(response: str) -> List[Dict[str, List[str]]] | None:
     # Match the code block
     response = response.strip()
     if not response.startswith("```") or not response.endswith("```"):
@@ -90,13 +97,14 @@ def parse_response(response: str) -> List[Dict[str, List[str]]] | None:
     try:
         data = json.loads(response)
     except json.JSONDecodeError:
         return None
     for paper in data:
-        if "index" not in paper or "category" not in paper:
             return None
-        if not isinstance(paper["index"], int) or not isinstance(paper["category"], list):
             return None
         for category in paper["category"]:
@@ -160,9 +168,15 @@ class PaperCache:
     def get(self, paper):
         key = paper["id"]
         with self.lock:
-            return self.cache.get(key)
     def set(self, paper, result):
         key = paper["id"]
         with self.lock:
             self.cache[key] = result
@@ -212,12 +226,14 @@ def classify_papers(papers: List[Dict[str, str]]) -> Optional[List[Dict[str, Lis
                     for f in futures:
                         f.cancel()
                     return None
-                for paper, result in zip(uncached_papers, batch_result):
-                    paper_cache.set(paper, result)
                 results.extend(batch_result)
-            results.sort(key=lambda x: x['index'])
             return cached_results + results
-    except Exception:
         return None

+import traceback
 import time
 from typing import List, Dict, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from threading import Lock
 from typing import List, Dict, Any
 import json
+from rich.console import Console
 try:
     from .ai import complete
 except ImportError:
     from ai import complete
+print = Console().log
 paper_types: Dict[str, str] = {
     "CV": "computer vision, any paper that deals with image, video, point cloud or 3D model data",
 ```json
 [
     {
+        "id": "2402.01032",
         "category": ["RO"]
     },
     {
+        "id": "2402.03254",
         "category": ["ML"]
     },
     {
+        "id": "2403.00043",
         "category": ["LLM", "NLP"]
     }
 ]
 ```
+Do not add any additional information in the output. The order of the papers in the output should match the order of the papers in the input.
 """.strip(),
     """
 The followings are the papers you need to classify:
 ])
+def build_paper(id: str, title: str, abstract: str = None) -> str:
     if abstract is None:
+        return f"{id}: {title}"
+    return f"{id}: {title}\n\n{abstract}"
 def get_classify_prompt(papers: List[Dict[str, str]]) -> str:
     prompt = []
     for index, paper in enumerate(papers, start=1):
+        prompt.append(build_paper(paper["id"], paper["title"], paper["abstract"] if "abstract" in paper else None))
     return user_prompt + "\n\n" + "\n\n".join(prompt)
 def parse_response(response: str) -> List[Dict[str, List[str]]] | None:
+    print(response)
     # Match the code block
     response = response.strip()
     if not response.startswith("```") or not response.endswith("```"):
     try:
         data = json.loads(response)
     except json.JSONDecodeError:
+        print(response)
         return None
     for paper in data:
+        if "id" not in paper or "category" not in paper:
             return None
+        if not isinstance(paper["id"], str) or not isinstance(paper["category"], list):
             return None
         for category in paper["category"]:
     def get(self, paper):
         key = paper["id"]
         with self.lock:
+            data = self.cache.get(key)
+            if data is not None:
+                print(f"Cache hit for {paper['id']}")
+                return data
+        print(f"Cache miss for {paper['id']}")
+        return None
     def set(self, paper, result):
+        print(f"Setting cache for {paper['id']}")
         key = paper["id"]
         with self.lock:
             self.cache[key] = result
                     for f in futures:
                         f.cancel()
                     return None
                 results.extend(batch_result)
+            print(results)
+            results.sort(key=lambda x: x['id'])
+            for result in results:
+                paper_cache.set(result, result)
             return cached_results + results
+    except Exception as e:
+        print(traceback.format_exc())
         return None

fetch_paper.py CHANGED Viewed

@@ -101,20 +101,21 @@ def fetch_papers_with_daterange(start_date: Date, end_date: Date):
     print(f"Unique articles: {len(unique_articles)}")
-    unique_articles: List[Article] = list(unique_articles.values())
     preprocessed_articles = list(map(lambda article: {
         "title": article.title,
         "abstract": article.paper.summary,
         "id": article.paper.id
-    }, unique_articles))
     classified_articles = classify_papers(preprocessed_articles)
-    for i, article in enumerate(unique_articles):
-        article.paper.label = classified_articles[i]["category"]
-    return unique_articles
 if __name__ == "__main__":

     print(f"Unique articles: {len(unique_articles)}")
+    preprocessed_articles: List[Article] = list(unique_articles.values())
     preprocessed_articles = list(map(lambda article: {
         "title": article.title,
         "abstract": article.paper.summary,
         "id": article.paper.id
+    }, preprocessed_articles))
     classified_articles = classify_papers(preprocessed_articles)
+    # Iterate over classified_articles and write each classification result into unique_articles
+    for article in classified_articles:
+        unique_articles[article["id"]].paper.label = article["category"]
+    return list(unique_articles.values())
 if __name__ == "__main__":