Update app.py
Browse files
app.py
CHANGED
@@ -28,6 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
28 |
# Regex patterns for detection
|
29 |
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
30 |
year_pattern = r'\b(19|20)\d{2}\b'
|
|
|
31 |
reference_keywords = ['reference', 'bibliography', 'sources']
|
32 |
financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
|
33 |
|
@@ -85,22 +86,20 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
85 |
elif any(keyword in text.lower() for keyword in reference_keywords):
|
86 |
references += text + " "
|
87 |
|
88 |
-
#
|
89 |
elif re.search(r"table\s*\d+", text, re.IGNORECASE):
|
90 |
-
# Look for captions and adjacent text
|
91 |
content += f"<TABLE>{text}</TABLE>\n"
|
92 |
|
93 |
-
#
|
94 |
elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
|
95 |
-
# Look for image bounding boxes
|
96 |
content += f"<FIGURE>{text}</FIGURE>\n"
|
97 |
|
98 |
# Equations (look for math symbols)
|
99 |
elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
|
100 |
content += f"<EQUATION>{text}</EQUATION>\n"
|
101 |
|
102 |
-
#
|
103 |
-
elif re.search(
|
104 |
content += f"<CODE>{text}</CODE>\n"
|
105 |
|
106 |
# Financial Metrics
|
@@ -123,6 +122,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
|
|
123 |
"content": content
|
124 |
}
|
125 |
|
|
|
126 |
def upload_with_progress(file_path, repo_id, token, progress):
|
127 |
"""
|
128 |
Upload file to Hugging Face Dataset using upload_file() API method.
|
|
|
28 |
# Regex patterns for detection
|
29 |
doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
|
30 |
year_pattern = r'\b(19|20)\d{2}\b'
|
31 |
+
code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
|
32 |
reference_keywords = ['reference', 'bibliography', 'sources']
|
33 |
financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
|
34 |
|
|
|
86 |
elif any(keyword in text.lower() for keyword in reference_keywords):
|
87 |
references += text + " "
|
88 |
|
89 |
+
# Tables
|
90 |
elif re.search(r"table\s*\d+", text, re.IGNORECASE):
|
|
|
91 |
content += f"<TABLE>{text}</TABLE>\n"
|
92 |
|
93 |
+
# Figures
|
94 |
elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
|
|
|
95 |
content += f"<FIGURE>{text}</FIGURE>\n"
|
96 |
|
97 |
# Equations (look for math symbols)
|
98 |
elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
|
99 |
content += f"<EQUATION>{text}</EQUATION>\n"
|
100 |
|
101 |
+
# Code blocks (text matching code_pattern that is at most 50 words long)
|
102 |
+
elif re.search(code_pattern, text) and len(text.split()) <= 50:
|
103 |
content += f"<CODE>{text}</CODE>\n"
|
104 |
|
105 |
# Financial Metrics
|
|
|
122 |
"content": content
|
123 |
}
|
124 |
|
125 |
+
|
126 |
def upload_with_progress(file_path, repo_id, token, progress):
|
127 |
"""
|
128 |
Upload file to Hugging Face Dataset using upload_file() API method.
|