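# Setup notes (assumptions, not stated in the original file): spacy.load()
# below needs the "en_core_web_sm" model, installable with
#     python -m spacy download en_core_web_sm
# and nltk.download() needs network access on first run. A missing spaCy model
# is a common cause of the "Runtime error" banner shown on a Hugging Face Space.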
import os
import spacy
import nltk
import torch
from transformers import pipeline
import PyPDF2
import gradio as gr

# Initialize required tools
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')

# Check if a GPU is available and use it
device = 0 if torch.cuda.is_available() else -1
analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)

# Define functions for text analysis
def spacy_ner_analysis(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


def nltk_extract_sentences(text):
    return nltk.tokenize.sent_tokenize(text)


def nltk_extract_quotes(text):
    # Keep any sentence that contains a double-quote character
    quotes = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        if '"' in sentence:
            quotes.append(sentence)
    return quotes


def count_tokens(text):
    return len(nltk.tokenize.word_tokenize(text))


def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() may return None for pages without a text layer
            text += page.extract_text() or ""
    return text
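
# PyPDF2 is deprecated upstream in favor of pypdf, which keeps the same
# PdfReader API. A minimal sketch of an equivalent extractor, assuming pypdf
# is installed (illustrative only; the app keeps using PyPDF2 above):
#
#     from pypdf import PdfReader
#
#     def extract_pdf_text_pypdf(file_path):
#         reader = PdfReader(file_path)
#         return "".join(page.extract_text() or "" for page in reader.pages)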

def analyze_text(text):
    # Sentiment helper; defined here but not wired into the Gradio interface below
    try:
        # Truncate to the model's 512-token input limit so long scripts
        # do not raise an error
        result = analyzer(text, truncation=True)
        return result
    except Exception as e:
        print(f"Error analyzing text: {str(e)}")
        return ""


def process_text(text, output_directory, filename_prefix):
    spacy_entities = spacy_ner_analysis(text)
    sentences = nltk_extract_sentences(text)
    quotes = nltk_extract_quotes(text)
    token_count = count_tokens(text)
    # Save results to files, creating the output directory if it does not exist
    os.makedirs(output_directory, exist_ok=True)
    with open(os.path.join(output_directory, f"{filename_prefix}_spacy_entities.txt"), "w", encoding="utf-8") as file:
        file.write(str(spacy_entities))
    with open(os.path.join(output_directory, f"{filename_prefix}_sentences.txt"), "w", encoding="utf-8") as file:
        file.write("\n".join(sentences))
    with open(os.path.join(output_directory, f"{filename_prefix}_quotes.txt"), "w", encoding="utf-8") as file:
        file.write("\n".join(quotes))
    with open(os.path.join(output_directory, f"{filename_prefix}_token_count.txt"), "w", encoding="utf-8") as file:
        file.write(str(token_count))
    # Return the results so callers do not have to recompute them
    return spacy_entities, sentences, quotes, token_count


def analyze_and_complete(file_path):
    if file_path.endswith(".pdf"):
        text = extract_pdf_text(file_path)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
    # Hardcoded, machine-specific output directory carried over from the original script
    output_directory = "/Users/Home/Library/Mobile Documents/com~apple~CloudDocs/osa/ุณููุงุฑูููุงุช/ููุงูู ุงููู ูููุฉ"
    filename_prefix = os.path.splitext(os.path.basename(file_path))[0]
    spacy_entities, sentences, quotes, token_count = process_text(text, output_directory, filename_prefix)
    return str(spacy_entities), "\n".join(sentences), "\n".join(quotes), str(token_count)


# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_and_complete,
    inputs=gr.File(file_count="single", type="filepath"),
    outputs=["text", "text", "text", "text"],
    title="Movie Script Analyzer and Completer",
    description="Upload a text or PDF file to analyze the movie script.",
)

if __name__ == "__main__":
    interface.launch()
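
# Quick smoke test without the UI (assumed usage, not part of the original;
# "sample_script.txt" is a placeholder path):
#     entities, sentences, quotes, tokens = analyze_and_complete("sample_script.txt")
#     print(tokens)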