"""Streamlit app: Latin masked-token prediction with a RoBERTa fill-mask model,
plus optional CLTK morphological analysis of the predicted tokens.

User-curated (token, phrase) pairs are persisted to a JSON file and take
precedence over model predictions for phrases that were saved before.
"""

import builtins
import json
import os

import streamlit as st
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
from cltk.data.fetch import FetchCorpus

DATA_FILE = "data.json"


def load_data():
    """Load the saved data (tokens and phrases) from the JSON file.

    Returns:
        dict: ``{"tokens": [...], "phrases": {phrase: token}}``; an empty
        structure when the file does not exist yet.
    """
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {"tokens": [], "phrases": {}}


def save_data(payload):
    """Persist the data (tokens and phrases) to the JSON file."""
    with open(DATA_FILE, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=4)


# Module-level store, shared by the save/lookup helpers below.
data = load_data()


def save_token_and_phrase(token, phrase):
    """Associate *token* with *phrase* and persist, unless already stored."""
    if phrase not in data["phrases"]:
        data["phrases"][phrase] = token
        save_data(data)


# --- CLTK corpus download -------------------------------------------------
# FetchCorpus.import_corpus() asks for interactive confirmation; temporarily
# patch builtins.input to auto-answer "Y".
_original_input = builtins.input


def _always_yes(prompt=""):
    """Answer 'Y' to any interactive prompt (echoed so it shows in the log)."""
    print(prompt, "Y")
    return "Y"


builtins.input = _always_yes
try:
    corpus_downloader = FetchCorpus(language="lat")
    corpus_downloader.import_corpus("lat_models_cltk")
finally:
    # BUG FIX: the original never restored builtins.input, leaving the
    # auto-"Y" patch active for the rest of the process.
    builtins.input = _original_input

try:
    from cltk import NLP

    nlp_lat = NLP(language="lat")
except ImportError:
    # CLTK unavailable: the morphology section below is skipped gracefully.
    nlp_lat = None

if "input_text_value" not in st.session_state:
    st.session_state["input_text_value"] = (
        "Lorem ipsum dolor sit amet, [MASK] adipiscing elit."
    )


@st.cache_resource(show_spinner=False)
def _load_roberta():
    """Load tokenizer/model/pipeline once per server (cached across reruns)."""
    tok = AutoTokenizer.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
    mdl = AutoModelForMaskedLM.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
    return tok, mdl, pipeline("fill-mask", model=mdl, tokenizer=tok)


tokenizer_roberta, model_roberta, fill_mask_roberta = _load_roberta()

# Predictions consisting solely of punctuation are discarded.
punctuation_marks = {".", ",", ";", ":", "!", "?"}


def get_valid_predictions(sentence, max_attempts=3, top_k=5):
    """Return fill-mask predictions for *sentence*, skipping punctuation.

    *sentence* uses the user-facing ``[MASK]`` placeholder. If the exact
    phrase was previously saved by the user, the stored token is returned
    with score 1.0 instead of querying the model.

    Args:
        sentence: input phrase containing the ``[MASK]`` placeholder.
        max_attempts: how many times to re-query when every prediction is
            punctuation.
        top_k: number of candidates requested from the pipeline per attempt.

    Returns:
        list[dict]: prediction dicts with ``token_str``, ``score``,
        ``sequence`` keys (possibly empty if all attempts yielded only
        punctuation).
    """
    if sentence in data["phrases"]:
        stored = data["phrases"][sentence]
        return [{
            "token_str": stored,
            "score": 1.0,
            "sequence": sentence.replace("[MASK]", stored),
        }]

    # BUG FIX: the original replaced "[MASK]" with "" before calling the
    # pipeline, removing the mask entirely. Convert the placeholder to the
    # model's real mask token (e.g. "<mask>" for RoBERTa) instead.
    masked_sentence = sentence.replace("[MASK]", tokenizer_roberta.mask_token)

    filtered_predictions = []
    for _ in range(max_attempts):
        predictions = fill_mask_roberta(masked_sentence, top_k=top_k)
        filtered_predictions = [
            pred for pred in predictions
            if pred["token_str"] not in punctuation_marks
        ]
        if filtered_predictions:
            break
    return filtered_predictions


# --- Sidebar: manage saved token/phrase pairs -----------------------------
st.sidebar.header("Gestione Token e Frasi")
token_input = st.sidebar.text_input("Inserisci il token:")
phrase_input = st.sidebar.text_area("Inserisci la frase:")

if st.sidebar.button("Salva Token e Frase"):
    if token_input and phrase_input:
        save_token_and_phrase(token_input, phrase_input)
        st.sidebar.success("Token e frase salvati con successo!")
    else:
        st.sidebar.warning("Inserisci sia un token che una frase validi.")

existing_phrases = data.get("phrases", {})
st.sidebar.subheader("Frasi salvate:")
st.sidebar.write(
    "\n".join(existing_phrases.keys())
    if existing_phrases
    else "Nessuna frase salvata."
)

# --- Main panel: prediction + morphology ----------------------------------
input_text = st.text_area(
    label="Testo:",
    height=150,
    key="input_text_value",
)

if input_text:
    # Pass the raw phrase: get_valid_predictions handles both the
    # saved-phrase lookup and the [MASK] -> model-mask-token conversion.
    predictions_roberta = get_valid_predictions(input_text)

    st.subheader("Risultati delle previsioni:")
    for pred in predictions_roberta:
        st.write(f" Token: {pred['token_str']}")
        st.write(f" Probabilità: {pred['score']:.4f}")
        st.write(f" Sequence: {pred['sequence']}")
        st.write("---")

    if nlp_lat is not None:
        st.subheader("Analisi Morfologica con CLTK")
        for pred in predictions_roberta:
            doc = nlp_lat(pred['token_str'])
            st.write(f"Frase: {pred['token_str']}")
            for w in doc.words:
                st.write(
                    f"- **Token**: {w.string}\n"
                    f"  - Lemma: {w.lemma}\n"
                    f"  - UPOS: {w.upos}\n"
                    f"  - Morph: {w.features}\n"
                )
            st.write("---")
    else:
        st.warning("CLTK non installato. Esegui 'pip install cltk' per abilitare l'analisi.")