Spaces:
Running
Running
import os | |
from typing import List, Dict, Any | |
import fitz # PyMuPDF | |
import docx | |
import requests | |
import io | |
def process_text_file(file_url: str) -> List[Dict[str, Any]]:
    """Dispatch a remote document to the extractor matching its file type.

    The type is taken from the extension of the URL's *path* component, so a
    query string or fragment (for example ``?alt=media&token=...``) cannot
    corrupt it, even when the query itself contains dots. If the path has no
    supported extension, the tail of the raw URL is inspected as a fallback so
    links such as ``.../download?file=report.pdf`` keep working.

    Args:
        file_url: URL of a ``.txt``, ``.pdf`` or ``.docx`` file.

    Returns:
        Whatever the selected ``process_*`` extractor returns.

    Raises:
        ValueError: If no supported extension can be found in the URL.
    """
    # Local import keeps this function self-contained with respect to the
    # module's import block.
    from urllib.parse import urlparse

    supported = ('.txt', '.pdf', '.docx')

    _, extension = os.path.splitext(urlparse(file_url).path)
    extension = extension.lower()

    if extension not in supported:
        # Legacy lookup: extension at the end of the raw URL, with anything
        # from '?' or '#' onwards dropped.
        _, tail = os.path.splitext(file_url)
        tail = tail.lower().split("?")[0].split("#")[0]
        if tail in supported or not extension:
            extension = tail

    if extension == '.txt':
        return process_txt(file_url)
    if extension == '.pdf':
        return process_pdf(file_url)
    if extension == '.docx':
        return process_docx(file_url)
    raise ValueError(f"Unsupported text file type: {extension}")
def process_txt(txt_url: str) -> List[Dict[str, Any]]:
    """Download a plain-text file and return its content as a single record.

    Args:
        txt_url: URL of the text file.

    Returns:
        A one-element list containing a dict with ``file_name``, ``text`` and
        ``page_number`` (always 1), or an empty list when the server does not
        answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # requests waits indefinitely unless a timeout is given.
    response = requests.get(txt_url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch the TXT file. Status code: {response.status_code}")
        return []

    if "charset" in response.headers.get("Content-Type", "").lower():
        # The server declared an encoding; trust it.
        content = response.text
    else:
        # Without a declared charset requests decodes text/* responses as
        # ISO-8859-1, which garbles UTF-8 files. Try UTF-8 first and only
        # fall back to requests' own guess if the bytes are not valid UTF-8.
        try:
            content = response.content.decode("utf-8")
        except UnicodeDecodeError:
            content = response.text

    return [{
        # NOTE: basename of the raw URL, so any query string is retained.
        "file_name": os.path.basename(txt_url),
        "text": content,
        "page_number": 1,
    }]
def process_pdf(pdf_url: str) -> List[Dict[str, Any]]:
    """Download a PDF and return the text of all its pages as a single record.

    Args:
        pdf_url: URL of the PDF file.

    Returns:
        A one-element list containing a dict with ``file_name``, ``text``
        (the text of every page, concatenated in page order) and
        ``page_number``, or an empty list when the server does not answer
        with HTTP 200.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # requests waits indefinitely unless a timeout is given.
    response = requests.get(pdf_url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch the PDF file. Status code: {response.status_code}")
        return []

    # Open the in-memory PDF with PyMuPDF; the context manager guarantees the
    # document is closed even if text extraction raises.
    pdf_stream = io.BytesIO(response.content)
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        pdf_text = "".join(page.get_text("text") for page in pdf_document)

    return [{
        # NOTE: basename of the raw URL, so any query string is retained.
        "file_name": os.path.basename(pdf_url),
        "text": pdf_text,
        # The whole document is returned as one record; the key mirrors the
        # TXT and DOCX records so consumers can rely on it being present.
        "page_number": 1,
    }]
def process_docx(docx_url: str) -> List[Dict[str, Any]]:
    """Download a DOCX file and return its paragraph text as a single record.

    Args:
        docx_url: URL of the DOCX file.

    Returns:
        A one-element list containing a dict with ``file_name``, ``text``
        (all paragraphs joined by newlines) and ``page_number`` (always 1),
        or an empty list when the server does not answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # requests waits indefinitely unless a timeout is given.
    response = requests.get(docx_url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to fetch the DOCX file. Status code: {response.status_code}")
        # Return an empty list, as the TXT and PDF extractors do, instead of
        # falling through and returning None.
        return []

    # python-docx reads from any file-like object, so wrap the bytes.
    doc = docx.Document(io.BytesIO(response.content))
    content = "\n".join(para.text for para in doc.paragraphs)

    return [{
        # NOTE: basename of the raw URL, so any query string is retained.
        "file_name": os.path.basename(docx_url),
        "text": content,
        "page_number": 1,  # DOCX doesn't have pages, so just 1
    }]