Jobey1 committed on
Commit
9433534
·
verified ·
1 Parent(s): b3bb65b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -28,6 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
28
  # Regex patterns for detection
29
  doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
30
  year_pattern = r'\b(19|20)\d{2}\b'
 
31
  reference_keywords = ['reference', 'bibliography', 'sources']
32
  financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
33
 
@@ -85,22 +86,20 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
85
  elif any(keyword in text.lower() for keyword in reference_keywords):
86
  references += text + " "
87
 
88
- # Enhanced Table Detection
89
  elif re.search(r"table\s*\d+", text, re.IGNORECASE):
90
- # Look for captions and adjacent text
91
  content += f"<TABLE>{text}</TABLE>\n"
92
 
93
- # Enhanced Figure Detection
94
  elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
95
- # Look for image bounding boxes
96
  content += f"<FIGURE>{text}</FIGURE>\n"
97
 
98
  # Equations (look for math symbols)
99
  elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
100
  content += f"<EQUATION>{text}</EQUATION>\n"
101
 
102
- # Code Blocks (detect indentation or keywords)
103
- elif re.search(r"def |class |import |for |while |if |else", text):
104
  content += f"<CODE>{text}</CODE>\n"
105
 
106
  # Financial Metrics
@@ -123,6 +122,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
123
  "content": content
124
  }
125
 
 
126
  def upload_with_progress(file_path, repo_id, token, progress):
127
  """
128
  Upload file to Hugging Face Dataset using upload_file() API method.
 
28
  # Regex patterns for detection
29
  doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
30
  year_pattern = r'\b(19|20)\d{2}\b'
31
+ code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
32
  reference_keywords = ['reference', 'bibliography', 'sources']
33
  financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
34
 
 
86
  elif any(keyword in text.lower() for keyword in reference_keywords):
87
  references += text + " "
88
 
89
+ # Tables
90
  elif re.search(r"table\s*\d+", text, re.IGNORECASE):
 
91
  content += f"<TABLE>{text}</TABLE>\n"
92
 
93
+ # Figures
94
  elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
 
95
  content += f"<FIGURE>{text}</FIGURE>\n"
96
 
97
  # Equations (look for math symbols)
98
  elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
99
  content += f"<EQUATION>{text}</EQUATION>\n"
100
 
101
+ # Improved Code Block Detection
102
+ elif re.search(code_pattern, text) and len(text.split()) <= 50:
103
  content += f"<CODE>{text}</CODE>\n"
104
 
105
  # Financial Metrics
 
122
  "content": content
123
  }
124
 
125
+
126
  def upload_with_progress(file_path, repo_id, token, progress):
127
  """
128
  Upload file to Hugging Face Dataset using upload_file() API method.