Spaces:

AIAcceleratorLab
/

ocr

Sleeping

App Files Files Community

development-v1

by jayyai - opened Feb 24

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+66

-123

Files changed (2) hide show

pdf_route.py +64 -122
requirements.txt +2 -1

pdf_route.py CHANGED Viewed

@@ -34,7 +34,7 @@ async def convert_to_markdown(file: UploadFile = File(...)):
         # Analyze the document
         result = analyze_document(temp_pdf_path)
         # Create markdown file
         temp_md_path = "temp.md"
         create_markdown_file(result, temp_md_path)
@@ -71,13 +71,25 @@ async def convert_to_excel(file: UploadFile = File(...)):
     try:
         # Read the markdown content
         content = await file.read()
-        markdown_text = content.decode('utf-8')
-        # Extract tables from markdown
-        tables = extract_tables_from_markdown(markdown_text)
-        if not tables:
-            raise HTTPException(status_code=400, detail="No tables found in the markdown content")
         # Create Excel file
         excel_buffer = create_excel_from_markdown_tables(tables)
@@ -104,20 +116,27 @@ async def convert_to_word(file: UploadFile = File(...)):
         StreamingResponse: Word document file
     """
     try:
-        # Read the markdown content
         content = await file.read()
-        markdown_text = content.decode('utf-8')
-        # Create Word file
-        temp_docx_path = "temp.docx"
-        create_word_from_markdown(markdown_text, temp_docx_path)
-        # Read the Word file
-        with open(temp_docx_path, "rb") as f:
             word_content = f.read()
-        # Clean up temporary file
-        os.remove(temp_docx_path)
         # Return the Word file as a download
         return StreamingResponse(
@@ -134,6 +153,7 @@ async def convert_to_word(file: UploadFile = File(...)):
 def analyze_document(file_path):
     """Analyze document using Azure Form Recognizer"""
     endpoint = "https://aal-ocr-ai-azureapi.cognitiveservices.azure.com/"
     key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
     document_analysis_client = DocumentAnalysisClient(
@@ -144,7 +164,7 @@ def analyze_document(file_path):
         poller = document_analysis_client.begin_analyze_document(
             "prebuilt-layout", document=f
         )
     result = poller.result()
     return result
@@ -185,86 +205,16 @@ def create_excel_from_markdown_tables(tables):
     """Create Excel file from markdown tables"""
     excel_buffer = BytesIO()
-    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
         for i, table in enumerate(tables):
-            if table:
-                # Convert table to DataFrame
-                df = pd.DataFrame(table[1:], columns=table[0])
-                # Save to Excel sheet
-                sheet_name = f"Table_{i+1}"
-                df.to_excel(writer, sheet_name=sheet_name, index=False)
     excel_buffer.seek(0)
     return excel_buffer
-def create_word_from_markdown(markdown_text, output_file):
-    """Create Word document from markdown text"""
-    doc = Document()
-    lines = markdown_text.split('\n')
-    current_table = []
-    in_table = False
-    for line in lines:
-        # Handle headers
-        if line.startswith('#'):
-            level = len(line.split()[0])  # Count the number of '#'
-            text = line.lstrip('#').strip()
-            doc.add_heading(text, level=min(level, 9))
-        # Handle tables
-        elif '|' in line:
-            # Skip separator lines
-            if re.match(r'^[\s|:-]+$', line):
-                continue
-            # Process table row
-            cells = [cell.strip() for cell in line.split('|')[1:-1]]
-            if cells:
-                if not in_table:
-                    in_table = True
-                    current_table = []
-                current_table.append(cells)
-        # Handle end of table
-        elif in_table:
-            if current_table:
-                table = doc.add_table(rows=len(current_table), cols=len(current_table[0]))
-                table.style = 'Table Grid'
-                for i, row in enumerate(current_table):
-                    for j, cell in enumerate(row):
-                        table.cell(i, j).text = cell
-                doc.add_paragraph()  # Add space after table
-            current_table = []
-            in_table = False
-        # Handle checkbox lists
-        elif line.strip().startswith('- ['):
-            p = doc.add_paragraph()
-            run = p.add_run()
-            if 'x' in line or 'X' in line:
-                run.add_text("☑ " + line[5:].strip())
-            else:
-                run.add_text("☐ " + line[5:].strip())
-        # Handle regular paragraphs
-        elif line.strip():
-            doc.add_paragraph(line.strip())
-    # Handle the last table if exists
-    if in_table and current_table:
-        table = doc.add_table(rows=len(current_table), cols=len(current_table[0]))
-        table.style = 'Table Grid'
-        for i, row in enumerate(current_table):
-            for j, cell in enumerate(row):
-                table.cell(i, j).text = cell
-    doc.save(output_file)
 def create_markdown_file(result, output_file):
     """Create markdown file from analysis result"""
     with open(output_file, 'w', encoding='utf-8') as md_file:
@@ -272,7 +222,7 @@ def create_markdown_file(result, output_file):
             # md_file.write(f"### Page {page.page_number}\n\n")
             elements = []
-            elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.05, 'paragraph', paragraph)
                            for paragraph in result.paragraphs if paragraph.bounding_regions[0].page_number == page.page_number])
             elements.sort(key=lambda x: x[0])
@@ -294,9 +244,8 @@ def create_markdown_file(result, output_file):
                 elements = [element for element in elements if element[2] != title_paragraph]
                 md_file.write(f"# {title_paragraph.content}\n\n")
-            elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.05, 'table', table)
                            for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
-            elements.extend([(mark.polygon[0].y + mark.polygon[0].x*0.05, 'selection_mark', mark) for mark in page.selection_marks])
             elements.sort(key=lambda x: x[0])
@@ -305,7 +254,8 @@ def create_markdown_file(result, output_file):
                 if element_type == 'paragraph':
                     if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
                         continue
-                    md_file.write(f"{element.content}\n\n")
                 elif element_type == 'table':
                     for row_idx in range(element.row_count):
@@ -314,18 +264,12 @@ def create_markdown_file(result, output_file):
                             cell_content = ""
                             for cell in element.cells:
                                 if cell.row_index == row_idx and cell.column_index == col_idx:
-                                    cell_content = cell.content
                                     table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
                                     break
                             row_content += f"{cell_content} | "
                         md_file.write(row_content + "\n")
                     md_file.write("\n")
-                elif element_type == 'selection_mark':
-                    if element.state == "selected":
-                        md_file.write("- [x] \n\n")
-                    else:
-                        md_file.write("- [ ] \n\n")
 def create_word_file(result, output_file):
     """Create Word document from analysis result"""
@@ -334,6 +278,7 @@ def create_word_file(result, output_file):
     # Analyze pages
     for page in result.pages:
         # Combine paragraphs, tables, and selection marks in the order they appear on the page
         elements = []
         elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
@@ -357,13 +302,13 @@ def create_word_file(result, output_file):
         if title_paragraph:
             elements = [element for element in elements if element[2] != title_paragraph]
-            doc.add_heading(title_paragraph.content, level=1)
         # Continuous combine paragraphs, tables, and selection marks in the order they appear on the page
         elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
                         for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
-        elements.extend([(mark.polygon[0].y + mark.polygon[0].x*0.01, 'selection_mark', mark)
-                        for mark in page.selection_marks])
         # Sort elements by the sum of their horizontal and vertical positions on the page
         elements.sort(key=lambda x: x[0])
@@ -375,7 +320,8 @@ def create_word_file(result, output_file):
                 # Skip lines that are part of a table
                 if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
                     continue
-                doc.add_paragraph(element.content)
             elif element_type == 'table':
                 table = doc.add_table(rows=element.row_count, cols=element.column_count)
                 table.style = 'Table Grid'
@@ -385,18 +331,10 @@ def create_word_file(result, output_file):
                         cell_content = ""
                         for cell in element.cells:
                             if cell.row_index == row_idx and cell.column_index == col_idx:
-                                cell_content = cell.content
                                 table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
                                 break
                         row_cells[col_idx].text = cell_content
-            elif element_type == 'selection_mark':
-                p = doc.add_paragraph()
-                run = p.add_run()
-                if element.state == "selected":
-                    run.add_text("☑ ")
-                else:
-                    run.add_text("☐ ")
     # Save Word document
     doc.save(output_file)
@@ -407,19 +345,23 @@ def format_polygon(polygon):
     return ", ".join([f"[{p.x}, {p.y}]" for p in polygon])
 def get_table_max_polygon(table):
-    """Get the maximum polygon coordinates for a table"""
     first_cell = table.cells[0]
     first_coordinate = first_cell.bounding_regions[0].polygon[0]
     last_cell = table.cells[-1]
-    last_coordinate = last_cell.bounding_regions[0].polygon[-1]
     return [first_coordinate, last_coordinate]
 def is_element_inside_table(element, table_max_polygon):
-    """Check if an element is inside a table"""
-    element_x = element.bounding_regions[0].polygon[0].x
-    element_y = element.bounding_regions[0].polygon[0].y
     first_coordinate = table_max_polygon[0]
-    last_coordinate = table_max_polygon[1]
     return (first_coordinate.x <= element_x <= last_coordinate.x and
-            first_coordinate.y <= element_y <= last_coordinate.y)

         # Analyze the document
         result = analyze_document(temp_pdf_path)
         # Create markdown file
         temp_md_path = "temp.md"
         create_markdown_file(result, temp_md_path)
     try:
         # Read the uploaded file content
         content = await file.read()
+        # Save the content to a temporary file
+        temp_pdf_path = "temp.pdf"
+        with open(temp_pdf_path, "wb") as f:
+            f.write(content)
+        # Analyze the document
+        result = analyze_document(temp_pdf_path)
+        tables = []
+        for table in result.tables:
+            table_data = []
+            for cell in table.cells:
+                table_data.append({
+                    "row_index": cell.row_index,
+                    "column_index": cell.column_index,
+                    "text": cell.content
+                })
+            tables.append(table_data)
         # Create Excel file
         excel_buffer = create_excel_from_markdown_tables(tables)
         StreamingResponse: Word document file
     """
     try:
+        # Read the uploaded file content
         content = await file.read()
+        # Save the content to a temporary file
+        temp_pdf_path = "temp.pdf"
+        with open(temp_pdf_path, "wb") as f:
+            f.write(content)
+        # Analyze the document
+        result = analyze_document(temp_pdf_path)
+        # Create word file
+        temp_word_path = "temp.docx"
+        create_word_file(result, temp_word_path)
+        # Read the word file
+        with open(temp_word_path, "rb") as f:
             word_content = f.read()
+        # Clean up temporary files
+        os.remove(temp_pdf_path)
+        os.remove(temp_word_path)
         # Return the Word file as a download
         return StreamingResponse(
 def analyze_document(file_path):
     """Analyze document using Azure Form Recognizer"""
     endpoint = "https://aal-ocr-ai-azureapi.cognitiveservices.azure.com/"
+    # endpoint = "https://zzaocrtool.cognitiveservices.azure.com/"
     key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
     document_analysis_client = DocumentAnalysisClient(
         poller = document_analysis_client.begin_analyze_document(
             "prebuilt-layout", document=f
         )
     result = poller.result()
     return result
     """Create Excel file from markdown tables"""
     excel_buffer = BytesIO()
+    with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
         for i, table in enumerate(tables):
+            df = pd.DataFrame(table)
+            df_pivot = df.pivot(index='row_index', columns='column_index', values='text')
+            sheet_name = f'Sheet{i+1}'
+            df_pivot.to_excel(writer, sheet_name=sheet_name, index=False)
     excel_buffer.seek(0)
     return excel_buffer
 def create_markdown_file(result, output_file):
     """Create markdown file from analysis result"""
     with open(output_file, 'w', encoding='utf-8') as md_file:
             # md_file.write(f"### Page {page.page_number}\n\n")
             elements = []
+            elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
                            for paragraph in result.paragraphs if paragraph.bounding_regions[0].page_number == page.page_number])
             elements.sort(key=lambda x: x[0])
                 elements = [element for element in elements if element[2] != title_paragraph]
                 md_file.write(f"# {title_paragraph.content}\n\n")
+            elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
                            for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
             elements.sort(key=lambda x: x[0])
                 if element_type == 'paragraph':
                     if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
                         continue
+                    content = element.content.replace(":selected:", "").replace(":unselected:", "")
+                    md_file.write(f"{content}\n\n")
                 elif element_type == 'table':
                     for row_idx in range(element.row_count):
                             cell_content = ""
                             for cell in element.cells:
                                 if cell.row_index == row_idx and cell.column_index == col_idx:
+                                    cell_content = cell.content.replace(":selected:", "").replace(":unselected:", "")
                                     table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
                                     break
                             row_content += f"{cell_content} | "
                         md_file.write(row_content + "\n")
                     md_file.write("\n")
 def create_word_file(result, output_file):
     """Create Word document from analysis result"""
     # Analyze pages
     for page in result.pages:
+        doc.add_heading(f"File Page {page.page_number}", level=2)
         # Combine paragraphs, tables, and selection marks in the order they appear on the page
         elements = []
         elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
         if title_paragraph:
             elements = [element for element in elements if element[2] != title_paragraph]
+        title = title_paragraph
+        doc.add_heading(title.content, level=1)
         # Continuous combine paragraphs, tables, and selection marks in the order they appear on the page
         elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
                         for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
         # Sort elements by the sum of their horizontal and vertical positions on the page
         elements.sort(key=lambda x: x[0])
                 # Skip lines that are part of a table
                 if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
                     continue
+                content = element.content.replace(":selected:", "").replace(":unselected:", "")
+                doc.add_paragraph(content)
             elif element_type == 'table':
                 table = doc.add_table(rows=element.row_count, cols=element.column_count)
                 table.style = 'Table Grid'
                         cell_content = ""
                         for cell in element.cells:
                             if cell.row_index == row_idx and cell.column_index == col_idx:
+                                cell_content = cell.content.replace(":selected:", "").replace(":unselected:", "")
                                 table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
                                 break
                         row_cells[col_idx].text = cell_content
     # Save Word document
     doc.save(output_file)
     return ", ".join([f"[{p.x}, {p.y}]" for p in polygon])
 def get_table_max_polygon(table):
+    # first coordinate: polygon point 0 of the table's first cell
     first_cell = table.cells[0]
     first_coordinate = first_cell.bounding_regions[0].polygon[0]
+    # last coordinate: polygon point 2 of the table's last cell
     last_cell = table.cells[-1]
+    last_coordinate = last_cell.bounding_regions[0].polygon[2]
+    # return max polygon
     return [first_coordinate, last_coordinate]
 def is_element_inside_table(element, table_max_polygon):
+    # test position: midpoint between the element's polygon points 0 and 2
+    element_x = (element.bounding_regions[0].polygon[0].x + element.bounding_regions[0].polygon[2].x)/2
+    element_y = (element.bounding_regions[0].polygon[0].y + element.bounding_regions[0].polygon[2].y)/2
     first_coordinate = table_max_polygon[0]
+    last_coordinate = table_max_polygon[1]  # TODO: unresolved issue with the 3rd and 4th polygon coordinates here; needs review and correction
     return (first_coordinate.x <= element_x <= last_coordinate.x and
+            first_coordinate.y <= element_y <= last_coordinate.y)

requirements.txt CHANGED Viewed

@@ -10,4 +10,5 @@ azure-ai-formrecognizer==3.3.0
 python-dotenv==1.0.0
 python-docx==1.1.0
 pandas==2.1.4
-openpyxl==3.1.5

 python-dotenv==1.0.0
 python-docx==1.1.0
 pandas==2.1.4
+openpyxl==3.1.5
+xlsxwriter==3.2.2