import os
import spacy
import nltk
import torch
from transformers import pipeline
import PyPDF2
import gradio as gr

# Initialize required tools
nlp = spacy.load("en_core_web_sm")
nltk.download('punkt')

# Use the GPU for the transformers pipeline if one is available
device = 0 if torch.cuda.is_available() else -1
analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)

# Define functions for text analysis
def spacy_ner_analysis(text):
    # Named-entity recognition with spaCy: return (text, label) pairs
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def nltk_extract_sentences(text):
    return nltk.tokenize.sent_tokenize(text)

def nltk_extract_quotes(text):
    # Keep only sentences that contain a double-quote character
    quotes = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        if '"' in sentence:
            quotes.append(sentence)
    return quotes

def count_tokens(text):
    return len(nltk.tokenize.word_tokenize(text))

def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable text
            text += page.extract_text() or ""
    return text

def analyze_text(text):
    # Sentiment-analysis helper (not wired into the Gradio flow below)
    try:
        return analyzer(text)
    except Exception as e:
        print(f"Error analyzing text: {e}")
        return ""

def process_text(text, output_directory, filename_prefix):
    spacy_entities = spacy_ner_analysis(text)
    sentences = nltk_extract_sentences(text)
    quotes = nltk_extract_quotes(text)
    token_count = count_tokens(text)

    # Save results to files
    os.makedirs(output_directory, exist_ok=True)
    with open(os.path.join(output_directory, f"{filename_prefix}_spacy_entities.txt"), "w", encoding="utf-8") as file:
        file.write(str(spacy_entities))
    with open(os.path.join(output_directory, f"{filename_prefix}_sentences.txt"), "w", encoding="utf-8") as file:
        file.write("\n".join(sentences))
    with open(os.path.join(output_directory, f"{filename_prefix}_quotes.txt"), "w", encoding="utf-8") as file:
        file.write("\n".join(quotes))
    with open(os.path.join(output_directory, f"{filename_prefix}_token_count.txt"), "w", encoding="utf-8") as file:
        file.write(str(token_count))

def analyze_and_complete(file_path):
    # Read the uploaded file (PDF or plain text)
    if file_path.endswith(".pdf"):
        text = extract_pdf_text(file_path)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

    output_directory = "/Users/Home/Library/Mobile Documents/com~apple~CloudDocs/osa/سيناريوهات/ليالي ألف ليلة"
    filename_prefix = os.path.splitext(os.path.basename(file_path))[0]
    process_text(text, output_directory, filename_prefix)

    spacy_entities = spacy_ner_analysis(text)
    sentences = nltk_extract_sentences(text)
    quotes = nltk_extract_quotes(text)
    token_count = count_tokens(text)
    return str(spacy_entities), "\n".join(sentences), "\n".join(quotes), str(token_count)

# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_and_complete,
    inputs=gr.File(file_count="single", type="filepath"),
    outputs=["text", "text", "text", "text"],
    title="Movie Script Analyzer and Completer",
    description="Upload a text or PDF file to analyze and complete the movie script.",
)

if __name__ == "__main__":
    interface.launch(share=True)