Spaces:

Jobey1
/

Convert-PDF-To-Parquet-With-paragraph-markers

Sleeping

App Files Files Community

Jobey1 committed on Feb 26

Commit

9433534

verified ·

1 Parent(s): b3bb65b

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -6

app.py CHANGED Viewed

@@ -28,6 +28,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
     # Regex patterns for detection
     doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
     year_pattern = r'\b(19|20)\d{2}\b'
     reference_keywords = ['reference', 'bibliography', 'sources']
     financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
@@ -85,22 +86,20 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
                 elif any(keyword in text.lower() for keyword in reference_keywords):
                     references += text + " "
-                # Enhanced Table Detection
                 elif re.search(r"table\s*\d+", text, re.IGNORECASE):
-                    # Look for captions and adjacent text
                     content += f"<TABLE>{text}</TABLE>\n"
-                # Enhanced Figure Detection
                 elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
-                    # Look for image bounding boxes
                     content += f"<FIGURE>{text}</FIGURE>\n"
                 # Equations (look for math symbols)
                 elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
                     content += f"<EQUATION>{text}</EQUATION>\n"
-                # Code Blocks (detect indentation or keywords)
-                elif re.search(r"def |class |import |for |while |if |else", text):
                     content += f"<CODE>{text}</CODE>\n"
                 # Financial Metrics
@@ -123,6 +122,7 @@ def extract_full_paper_with_labels(pdf_path, progress=None):
         "content": content
     }
 def upload_with_progress(file_path, repo_id, token, progress):
     """
     Upload file to Hugging Face Dataset using upload_file() API method.

     # Regex patterns for detection
     doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
     year_pattern = r'\b(19|20)\d{2}\b'
+    code_pattern = r"(def\s+\w+\s*\(|class\s+\w+|import\s+\w+|for\s+\w+\s+in|if\s+\w+|while\s+\w+|try:|except|{|\}|;)"
     reference_keywords = ['reference', 'bibliography', 'sources']
     financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
                 elif any(keyword in text.lower() for keyword in reference_keywords):
                     references += text + " "
+                # Tables
                 elif re.search(r"table\s*\d+", text, re.IGNORECASE):
                     content += f"<TABLE>{text}</TABLE>\n"
+                # Figures
                 elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
                     content += f"<FIGURE>{text}</FIGURE>\n"
                 # Equations (look for math symbols)
                 elif re.search(r"=|∑|√|±|×|π|μ|σ", text):
                     content += f"<EQUATION>{text}</EQUATION>\n"
+                # ✅ Improved Code Block Detection
+                elif re.search(code_pattern, text) and len(text.split()) <= 50:
                     content += f"<CODE>{text}</CODE>\n"
                 # Financial Metrics
         "content": content
     }
 def upload_with_progress(file_path, repo_id, token, progress):
     """
     Upload file to Hugging Face Dataset using upload_file() API method.