Update app.py
Added more robust label detection.
app.py
CHANGED
@@ -2,19 +2,35 @@ import gradio as gr
 import pandas as pd
 import fitz  # PyMuPDF
 import os
+import re
 from huggingface_hub import HfApi
 from huggingface_hub.utils import HfHubHTTPError
 import time
 
-def extract_paragraphs_with_headers(pdf_path, progress=None):
+def extract_full_paper_with_labels(pdf_path, progress=None):
     print(f"📄 Starting PDF Processing: {os.path.basename(pdf_path)}")
     doc = fitz.open(pdf_path)
-
-
+    content = ""
+
+    # Initialize metadata
+    title = ""
+    authors = ""
+    year = ""
+    doi = ""
+    abstract = ""
+    footnotes = ""
+    references = ""
+    sources = ""
     total_pages = len(doc)
     max_iterations = total_pages * 2  # To prevent infinite loops
     iteration_count = 0
 
+    # Regex patterns for detection
+    doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
+    year_pattern = r'\b(19|20)\d{2}\b'
+    reference_keywords = ['reference', 'bibliography', 'sources']
+    financial_keywords = ['p/e', 'volatility', 'market cap', 'roi', 'sharpe', 'drawdown']
+
     for page_num, page in enumerate(doc):
         iteration_count += 1
         if iteration_count > max_iterations:
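As a sanity check on the two patterns introduced above, they can be exercised on a throwaway string, independent of the rest of app.py; a minimal sketch (the sample text is made up):

import re

doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
year_pattern = r'\b(19|20)\d{2}\b'

sample = "Smith et al. (2021). Risk and Return. doi:10.1234/ABCD.5678"
print(re.search(doi_pattern, sample).group(0))   # 10.1234/ABCD.5678
print(re.search(year_pattern, sample).group(0))  # 2021

Note that the DOI character class is upper-case only, so lower-case DOI suffixes (common in practice) are only partially matched unless re.IGNORECASE is added.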
@@ -27,23 +43,85 @@ def extract_paragraphs_with_headers(pdf_path, progress=None):
         for block in blocks:
             if "lines" in block:
                 text = ""
+                max_font_size = 0
                 for line in block["lines"]:
                     for span in line["spans"]:
                         text += span["text"] + " "
+                        if span["size"] > max_font_size:
+                            max_font_size = span["size"]
 
                 text = text.strip()
 
-                #
-
-
-
-
-
-
-
+                # Title (First Page, Largest Font)
+                if page_num == 0 and max_font_size > 15 and not title:
+                    title = text
+                    content += f"<TITLE>{title}</TITLE>\n"
+
+                # Authors
+                elif re.search(r'author|by', text, re.IGNORECASE) and not authors:
+                    authors = text
+                    content += f"<AUTHORS>{authors}</AUTHORS>\n"
+
+                # Year
+                elif re.search(year_pattern, text) and not year:
+                    year = re.search(year_pattern, text).group(0)
+                    content += f"<YEAR>{year}</YEAR>\n"
+
+                # DOI
+                elif re.search(doi_pattern, text) and not doi:
+                    doi = re.search(doi_pattern, text).group(0)
+                    content += f"<DOI>{doi}</DOI>\n"
+
+                # Abstract
+                elif "abstract" in text.lower() and not abstract:
+                    abstract = text
+                    content += f"<ABSTRACT>{abstract}</ABSTRACT>\n"
+
+                # Footnotes (small fonts)
+                elif max_font_size < 10:
+                    footnotes += text + " "
+
+                # References
+                elif any(keyword in text.lower() for keyword in reference_keywords):
+                    references += text + " "
+
+                # Enhanced Table Detection
+                elif re.search(r"table\s*\d+", text, re.IGNORECASE):
+                    # Look for captions and adjacent text
+                    content += f"<TABLE>{text}</TABLE>\n"
+
+                # Enhanced Figure Detection
+                elif re.search(r"figure\s*\d+", text, re.IGNORECASE):
+                    # Look for image bounding boxes
+                    content += f"<FIGURE>{text}</FIGURE>\n"
+
+                # Equations (look for math symbols)
+                elif re.search(r"=|∑|∫|±|×|π|μ|σ", text):
+                    content += f"<EQUATION>{text}</EQUATION>\n"
+
+                # Code Blocks (detect indentation or keywords)
+                elif re.search(r"def |class |import |for |while |if |else", text):
+                    content += f"<CODE>{text}</CODE>\n"
+
+                # Financial Metrics
+                elif any(fin_kw in text.lower() for fin_kw in financial_keywords):
+                    content += f"<FINANCIAL_METRIC>{text}</FINANCIAL_METRIC>\n"
+
+                # Regular Paragraph
+                else:
+                    content += f"<PARAGRAPH>{text}</PARAGRAPH>\n"
+
+    # Append Footnotes and References
+    if footnotes:
+        content += f"<FOOTNOTE>{footnotes.strip()}</FOOTNOTE>\n"
+    if references:
+        content += f"<REFERENCE>{references.strip()}</REFERENCE>\n"
 
     print(f"✅ Finished Processing PDF: {os.path.basename(pdf_path)}")
-    return
+    return {
+        "filename": os.path.basename(pdf_path),
+        "content": content
+    }
 
 def upload_with_progress(file_path, repo_id, token, progress):
     """
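The title and footnote branches above key off the largest span size seen in a block. That span walk can be tried on its own with a throwaway in-memory PDF; a minimal sketch, separate from app.py (the strings and sizes are illustrative):

import fitz  # PyMuPDF

doc = fitz.open()  # new empty PDF, not a file on disk
page = doc.new_page()
page.insert_text((72, 100), "A Large Title", fontsize=20)
page.insert_text((72, 140), "Body text at a normal size.", fontsize=10)

for block in page.get_text("dict")["blocks"]:
    if "lines" in block:
        text = ""
        max_font_size = 0
        for line in block["lines"]:
            for span in line["spans"]:
                text += span["text"] + " "
                max_font_size = max(max_font_size, span["size"])
        print(round(max_font_size), "->", text.strip())

With the thresholds in this commit, the 20 pt block would be labeled a title on page one, while the 10 pt block falls through to the paragraph branch (footnotes require strictly less than 10 pt).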
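Since the function now returns one flat string of <TAG>…</TAG> segments per paper, downstream consumers have to split it back apart. A minimal sketch of that inverse step; parse_labeled_content is a hypothetical helper, not part of this commit:

import re

def parse_labeled_content(content):
    # Tags emitted above are single upper-case words such as TITLE or FINANCIAL_METRIC.
    return re.findall(r"<([A-Z_]+)>(.*?)</\1>", content, re.DOTALL)

sample = "<TITLE>A Study of X</TITLE>\n<PARAGRAPH>Some body text.</PARAGRAPH>\n"
for label, text in parse_labeled_content(sample):
    print(label, "->", text)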
@@ -87,20 +165,14 @@ def pdf_to_parquet_and_upload(pdf_files, hf_token, dataset_repo_id, action_choic
         if progress is not None:
             progress(idx / total_files, desc=f"Processing File {idx + 1}/{total_files}")
 
-        # ✅ Step 1: Process PDF
-        extracted_data = extract_paragraphs_with_headers(pdf_file.name, progress=progress)
-        for item in extracted_data:
-            all_data.append({
-                'filename': os.path.basename(pdf_file.name),
-                'page_num': item['page_num'],
-                'text': item['text'],
-                'is_header': item['is_header']
-            })
+        # ✅ Step 1: Process PDF with Full Labels
+        extracted_data = extract_full_paper_with_labels(pdf_file.name, progress=progress)
+        all_data.append(extracted_data)
 
     print("📡 Converting Processed Data to Parquet")
     # ✅ Step 2: Convert to Parquet
     df = pd.DataFrame(all_data)
-    parquet_file = '
+    parquet_file = 'fully_labeled_papers.parquet'
 
     try:
         df.to_parquet(parquet_file, engine='pyarrow', index=False)
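Worth flagging for downstream users: the schema changes here from one row per paragraph (filename, page_num, text, is_header) to one row per PDF (filename, content). A quick read-back sketch against the file name set in this commit:

import pandas as pd

df = pd.read_parquet('fully_labeled_papers.parquet', engine='pyarrow')
print(df.columns.tolist())         # ['filename', 'content']
print(df.loc[0, 'content'][:200])  # first labeled segments of the first paper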
@@ -135,8 +207,9 @@ iface = gr.Interface(
         gr.File(label="Download Parquet File"),
         gr.Textbox(label="Status")
     ],
-    title="PDF to Parquet Converter with
-    description="Upload your PDFs, convert them to Parquet, and upload to your Hugging Face Dataset
+    title="PDF to Parquet Converter with Full Labeling",
+    description="Upload your PDFs, convert them to Parquet with full section labeling, and upload to your Hugging Face Dataset."
 )
 
 iface.launch()
+