Spaces:
Runtime error
Runtime error
File size: 2,522 Bytes
4ec5ed1 c3d42ed 4ec5ed1 7b4a700 4ec5ed1 7b4a700 6969f60 4ec5ed1 7b4a700 4ec5ed1 c3d42ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os
import spacy
import nltk
import torch
from transformers import pipeline
import PyPDF2
import gradio as gr
# --- One-time model / resource setup --------------------------------------
# Load the small English spaCy pipeline. Only download it when it is not
# already installed: the original unconditionally re-downloaded the model
# on every startup, which is slow and requires network access.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Punkt sentence-tokenizer data used by nltk.tokenize below.
nltk.download('punkt')

# Run the transformers pipeline on GPU (device 0) when available, else CPU (-1).
device = 0 if torch.cuda.is_available() else -1
analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)
def spacy_ner_analysis(text):
    """Run spaCy NER over *text* and return (entity_text, label) pairs."""
    parsed = nlp(text)
    return [(entity.text, entity.label_) for entity in parsed.ents]
def nltk_extract_sentences(text):
    """Split *text* into a list of sentences with NLTK's Punkt tokenizer."""
    return nltk.tokenize.sent_tokenize(text)
def nltk_extract_quotes(text):
    """Return the sentences of *text* that contain quoted material.

    A sentence is kept when it contains a straight double quote or a
    typographic (curly) double quote. The original matched only the
    straight '"' character and silently missed smart-quoted dialogue,
    which is the common form in word-processed scripts.
    """
    quote_chars = ('"', '\u201c', '\u201d')  # " , left/right curly quotes
    sentences = nltk.tokenize.sent_tokenize(text)
    return [s for s in sentences if any(q in s for q in quote_chars)]
def count_tokens(text):
    """Return how many word tokens NLTK finds in *text*."""
    words = nltk.tokenize.word_tokenize(text)
    return len(words)
def extract_pdf_text(file_path):
    """Concatenate the extracted text of every page in the PDF at *file_path*.

    Args:
        file_path: path to a PDF file on disk.

    Returns:
        str: all page text joined together (empty string for an empty PDF).
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() may return None for pages with no extractable text
        # (e.g. scanned images); the original then raised TypeError on
        # `text += None`. Coerce None to "" and join at C speed.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def analyze_and_complete(file_paths):
    """Analyze each uploaded file and return one analysis tuple per file.

    Args:
        file_paths: iterable of local file paths; ".pdf" files are parsed
            with PyPDF2, everything else is read as UTF-8 text.

    Returns:
        list[tuple[str, str, str, str]]: per file, (stringified NER
        entities, newline-joined sentences, newline-joined quote
        sentences, token count as a string) — all strings so they can
        feed the Gradio text outputs directly.
    """
    results = []
    for file_path in file_paths:
        if file_path.endswith(".pdf"):
            text = extract_pdf_text(file_path)
        else:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
        # NOTE(review): the original computed an unused, hard-coded personal
        # iCloud output directory and a filename prefix here; both were dead
        # code and have been removed.
        spacy_entities = spacy_ner_analysis(text)
        sentences = nltk_extract_sentences(text)
        quotes = nltk_extract_quotes(text)
        token_count = count_tokens(text)
        results.append((str(spacy_entities), "\n".join(sentences), "\n".join(quotes), str(token_count)))
    return results
# Define the Gradio interface: four text panes mirroring the four fields
# produced per file by analyze_and_complete.
interface = gr.Interface(
    fn=analyze_and_complete,
    inputs=gr.File(file_count="multiple", type="filepath"),
    outputs=["text", "text", "text", "text"],
    title="Movie Script Analyzer and Completer",
    # The original description advertised DOCX support, but the code only
    # handles PDF (via PyPDF2) and plain text — a .docx would be read as
    # raw UTF-8 and fail. Wording corrected to match actual behavior.
    description="Upload text or PDF files to analyze the movie script.",
)

if __name__ == "__main__":
    interface.launch()
|