verbisense / src /text_processor.py
HARISH20205's picture
first
c8c7a9e
import os
from typing import List, Dict, Any
import fitz # PyMuPDF
import docx
import requests
import io
def process_text_file(file_url: str) -> List[Dict[str, Any]]:
_, extension = os.path.splitext(file_url)
extension = extension.lower()
if "?alt=media&token=" in extension:
extension = list(extension.split("?"))[0]
if extension == '.txt':
return process_txt(file_url)
elif extension == '.pdf':
return process_pdf(file_url)
elif extension == '.docx':
return process_docx(file_url)
else:
raise ValueError(f"Unsupported text file type: {extension}")
def process_txt(txt_url: str) -> List[Dict[str, Any]]:
# Fetch the TXT file content from the URL
response = requests.get(txt_url)
# Check if the request was successful
if response.status_code == 200:
content = response.text
return [{
"file_name": os.path.basename(txt_url),
"text": content,
"page_number": 1
}]
else:
print(f"Failed to fetch the TXT file. Status code: {response.status_code}")
return []
def process_pdf(pdf_url: str) -> List[Dict[str, Any]]:
# Fetch the PDF file content from the URL
response = requests.get(pdf_url)
# Check if the request was successful
if response.status_code == 200:
# Load the PDF file from the response content
pdf_stream = io.BytesIO(response.content)
# Open the PDF file with PyMuPDF
pdf_document = fitz.open(stream=pdf_stream, filetype="pdf")
# Extract text from all pages
pdf_text = ""
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num) # Load the page
pdf_text += page.get_text("text") # Extract text from the page
return [{
"file_name": os.path.basename(pdf_url),
"text": pdf_text
}]
else:
print(f"Failed to fetch the PDF file. Status code: {response.status_code}")
return []
def process_docx(docx_url: str) -> List[Dict[str, Any]]:
# Fetch the DOCX file content from the URL
response = requests.get(docx_url)
# Check if the request was successful
if response.status_code == 200:
# Load the DOCX file from the response content
docx_stream = io.BytesIO(response.content)
# Open the DOCX file with python-docx
doc = docx.Document(docx_stream)
# Extract text from the DOCX file
content = "\n".join([para.text for para in doc.paragraphs])
return [{
"file_name": os.path.basename(docx_url),
"text": content,
"page_number": 1 # DOCX doesn't have pages, so just 1
}]
else:
print(f"Failed to fetch the DOCX file. Status code: {response.status_code}")