Spaces:
Runtime error
Runtime error
Rivalcoder
committed on
Commit
·
7d0e6b0
1
Parent(s):
afd28fa
[Edit] Update of Image Data Handling
Browse files
- Dockerfile +7 -0
- llm.py +8 -0
- pdf_parser.py +69 -33
- requirements.txt +2 -0
Dockerfile
CHANGED
|
@@ -5,6 +5,13 @@ WORKDIR /app
|
|
| 5 |
# Install system dependencies
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
|
| 10 |
# Create a non-root user
|
|
|
|
| 5 |
# Install system dependencies
|
| 6 |
# build-essential: compiler toolchain; tesseract-ocr: OCR engine binary used
# through pytesseract; the remaining packages are native runtime libraries.
# Apt caches are removed in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    build-essential \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    poppler-utils \
    tesseract-ocr \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
# Create a non-root user
|
llm.py
CHANGED
|
@@ -30,6 +30,9 @@ You are an expert insurance assistant generating formal yet user-facing answers
|
|
| 30 |
- Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
|
| 31 |
- If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
|
| 32 |
- Avoid giving theory Based Long Long answers Try to Give Short Good Reasonable Answers.
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
π DO NOT:
|
| 35 |
- Use words like "context", "document", or "text".
|
|
@@ -37,12 +40,17 @@ You are an expert insurance assistant generating formal yet user-facing answers
|
|
| 37 |
- Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
|
| 38 |
- Use overly robotic passive constructions like "shall be indemnified".
|
| 39 |
- Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
|
|
|
|
| 40 |
|
| 41 |
✅ DO:
|
| 42 |
- Write in clean, informative language.
|
| 43 |
- Give complete answers in 2–3 sentences maximum.
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
📤 OUTPUT FORMAT (strict):
|
|
|
|
| 30 |
- Limit each answer to 2–3 sentences, and do not repeat unnecessary information.
|
| 31 |
- If a question can be answered with a simple "Yes", "No", "Can apply", or "Cannot apply", then begin the answer with that phrase, followed by a short supporting Statement In Natural Human Like response.So Give A Good Answer For The Question With Correct Information.
|
| 32 |
- Avoid giving theory Based Long Long answers Try to Give Short Good Reasonable Answers.
|
| 33 |
+
- Dont Give Long theory Like Response Very Large Response Just Give Short And Good Response For The Question.
|
| 34 |
+
- If the question is general (math, code, tech, etc.) and No Matches With Context, answer normally without referencing the document.
|
| 35 |
+
- Avoid Saying “Not found” or “Out of scope” For The Answer of The Question Try to Give Basic General Response For The Question.
|
| 36 |
|
| 37 |
π DO NOT:
|
| 38 |
- Use words like "context", "document", or "text".
|
|
|
|
| 40 |
- Say "helpful", "available", "allowed", "indemnified", "excluded", etc.
|
| 41 |
- Use overly robotic passive constructions like "shall be indemnified".
|
| 42 |
- Dont Give In Message Like "Based On The Context "Or "Nothing Refered In The context" Like That Dont Give In Response Try To Give Answer For The Question Alone
|
| 43 |
+
- Over-explain or give long theory answers.
|
| 44 |
|
| 45 |
✅ DO:
|
| 46 |
- Write in clean, informative language.
|
| 47 |
- Give complete answers in 2–3 sentences maximum.
|
| 48 |
|
| 49 |
|
| 50 |
+
π EXAMPLE ANSWERS:
|
| 51 |
+
- "Yes, the policy covers damage to personal property caused by fire, up to a limit of $50,000."
|
| 52 |
+
- "No, the policy does not cover pre-existing conditions."
|
| 53 |
+
- "The waiting period for coverage to begin is 30 days from the start date of the policy."
|
| 54 |
|
| 55 |
|
| 56 |
📤 OUTPUT FORMAT (strict):
|
pdf_parser.py
CHANGED
|
@@ -2,49 +2,85 @@ import fitz # PyMuPDF
|
|
| 2 |
import requests
|
| 3 |
from io import BytesIO
|
| 4 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def _extract_text(page):
|
| 7 |
text = page.get_text()
|
| 8 |
return text.strip() if text and text.strip() else None
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
|
| 11 |
"""
|
| 12 |
-
Download PDF from URL, extract text
|
|
|
|
| 13 |
"""
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
|
| 33 |
"""
|
| 34 |
Parse a local PDF file, extract text in parallel, optionally chunk pages.
|
| 35 |
"""
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
if
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
| 2 |
import requests
|
| 3 |
from io import BytesIO
|
| 4 |
from concurrent.futures import ThreadPoolExecutor
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import pytesseract
|
| 7 |
+
import imghdr
|
| 8 |
|
| 9 |
def _extract_text(page):
|
| 10 |
text = page.get_text()
|
| 11 |
return text.strip() if text and text.strip() else None
|
| 12 |
|
| 13 |
+
def is_image(content):
    """Return True when *content* starts with a known raster-image signature.

    Recognises JPEG, PNG, BMP, GIF, TIFF (classic and BigTIFF) and WebP by
    their leading magic bytes. The check is done by hand rather than through
    the standard-library ``imghdr`` module, which was deprecated in
    Python 3.11 and removed in 3.13 and which also fails to recognise JPEG
    files that lack a JFIF/Exif marker.

    Args:
        content: Raw bytes of the downloaded document (bytes-like).

    Returns:
        bool: True if the header matches one of the supported image formats,
        False otherwise (including for empty or missing content).
    """
    if not content:
        return False

    # 16 bytes is enough for every signature below (WebP needs bytes 8-11).
    header = bytes(content[:16])

    fixed_prefixes = (
        b"\xff\xd8\xff",        # JPEG (any APPn/DQT segment may follow)
        b"\x89PNG\r\n\x1a\n",   # PNG
        b"GIF87a",              # GIF
        b"GIF89a",
        b"BM",                  # BMP
        b"II*\x00",             # TIFF, little-endian
        b"MM\x00*",             # TIFF, big-endian
        b"II+\x00",             # BigTIFF, little-endian
        b"MM\x00+",             # BigTIFF, big-endian
    )
    if header.startswith(fixed_prefixes):
        return True

    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    return header[:4] == b"RIFF" and header[8:12] == b"WEBP"
|
| 15 |
+
|
| 16 |
+
def extract_text_from_image_bytes(image_bytes):
    """Run OCR over an in-memory image and return the recognised text.

    Args:
        image_bytes: Raw bytes of an image file in any format Pillow can open.

    Returns:
        str: The text reported by ``pytesseract.image_to_string`` with
        leading and trailing whitespace removed (may be an empty string).

    Raises:
        Whatever Pillow or pytesseract raise for unreadable images or a
        failing OCR run; callers are expected to handle these.
    """
    # Open through a context manager so Pillow releases the image resources
    # as soon as OCR is done instead of waiting for garbage collection.
    with Image.open(BytesIO(image_bytes)) as image:
        return pytesseract.image_to_string(image).strip()
|
| 19 |
+
|
| 20 |
def parse_pdf_from_url_multithreaded(url, max_workers=2, chunk_size=1):
    """
    Download a document (PDF or image) from a URL and extract its text.

    The function never raises: every failure path returns a one-element list
    holding a "No data found in this document (...)" fallback message.

    Args:
        url: Address of the document to download.
        max_workers: Number of worker threads used for per-page extraction.
        chunk_size: When greater than 1, the texts of this many consecutive
            pages are joined with a space into one chunk; otherwise one
            entry per non-empty page is returned.

    Returns:
        list[str]: Extracted page texts / chunks, the OCR text for images,
        or a single fallback message.
    """
    from urllib.parse import urlparse

    try:
        # A timeout keeps a stalled server from blocking the caller forever;
        # raise_for_status stops HTTP error pages from being parsed as PDFs.
        res = requests.get(url, timeout=(10, 60))
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        print(f"❌ Failed to download: {e}")
        return ["No data found in this document (download error)"]

    # Compare extensions against both the raw URL and its path component so
    # that "file.zip?token=..." and upper-case extensions are recognised too.
    url_lower = url.lower()
    url_path = urlparse(url).path.lower()

    def has_extension(ext):
        return url_lower.endswith(ext) or url_path.endswith(ext)

    # Sniff the payload: many servers label PDFs and images as
    # application/octet-stream, so the header alone is not reliable.
    looks_like_pdf = b"%PDF-" in content[:1024]
    looks_like_image = "image" in content_type or is_image(content)

    # Check for unsupported content
    if "zip" in content_type or has_extension(".zip"):
        return ["No data found in this document (zip)"]
    generic_binary = "octet-stream" in content_type or has_extension(".bin")
    if generic_binary and not (looks_like_pdf or looks_like_image):
        return ["No data found in this document (bin)"]

    # OCR for image files
    if looks_like_image:
        print("📷 Detected image file. Using OCR...")
        try:
            text = extract_text_from_image_bytes(content)
            return [text] if text else ["No data found in this document (image empty)"]
        except Exception as e:
            print(f"❌ OCR failed: {e}")
            return ["No data found in this document (image/OCR error)"]

    # Try PDF fallback
    empty_pdf = ["No data found in this document (empty PDF)"]
    try:
        with fitz.open(stream=BytesIO(content), filetype="pdf") as doc:
            pages = list(doc)
            # NOTE(review): PyMuPDF's documentation warns against using one
            # document from several threads; confirm this is safe here.
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                texts = list(executor.map(_extract_text, pages))
            if chunk_size > 1:
                chunks = []
                for start in range(0, len(texts), chunk_size):
                    chunk = " ".join(t for t in texts[start:start + chunk_size] if t)
                    if chunk:
                        chunks.append(chunk)
                return chunks or empty_pdf
            return [t for t in texts if t] or empty_pdf
    except Exception as e:
        print(f"❌ Failed to parse as PDF: {e}")
        return ["No data found in this document (not PDF or corrupted)"]
|
| 66 |
|
| 67 |
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """
    Parse a local PDF file, extract text in parallel, optionally chunk pages.

    The function never raises: if the file cannot be opened or parsed, a
    one-element list with a fallback message is returned instead.

    Args:
        file_path: Path of the PDF file to open.
        max_workers: Number of worker threads used for per-page extraction.
        chunk_size: When greater than 1, the texts of this many consecutive
            pages are joined with a space into one chunk; otherwise one
            entry per non-empty page is returned.

    Returns:
        list[str]: Extracted page texts / chunks, or a single
        "No data found in this document (...)" message.
    """
    empty_pdf = ["No data found in this document (local PDF empty)"]
    try:
        with fitz.open(file_path) as doc:
            pages = list(doc)
            # NOTE(review): PyMuPDF's documentation warns against using one
            # document from several threads; confirm this is safe here.
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                texts = list(executor.map(_extract_text, pages))
            if chunk_size > 1:
                chunks = []
                for start in range(0, len(texts), chunk_size):
                    chunk = " ".join(t for t in texts[start:start + chunk_size] if t)
                    if chunk:
                        chunks.append(chunk)
                return chunks or empty_pdf
            return [t for t in texts if t] or empty_pdf
    except Exception as e:
        print(f"❌ Failed to open local file: {e}")
        return ["No data found in this document (local file error)"]
|
requirements.txt
CHANGED
|
@@ -7,4 +7,6 @@ PyMuPDF
|
|
| 7 |
python-dotenv
|
| 8 |
tf-keras
|
| 9 |
google-generativeai
|
|
|
|
|
|
|
| 10 |
|
|
|
|
| 7 |
python-dotenv
|
| 8 |
tf-keras
|
| 9 |
google-generativeai
|
| 10 |
+
pytesseract
|
| 11 |
+
Pillow
|
| 12 |
|