# Streamlit app: Latin fill-mask prediction (RoBERTa) with CLTK morphological analysis.
import streamlit as st
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from cltk.data.fetch import FetchCorpus
import builtins
import os
import json
# Path of the JSON file that persists user-saved tokens and phrases.
DATA_FILE = "data.json"


def load_data():
    """Load the saved data (tokens and phrases) from the JSON file.

    Returns:
        dict: Persisted data with keys ``"tokens"`` (list) and ``"phrases"``
        (dict mapping phrase -> token). Returns the empty structure when the
        file does not exist, or when it is unreadable/corrupt — a bad data
        file must not crash the app at startup.
    """
    if os.path.exists(DATA_FILE):
        try:
            with open(DATA_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            # Fall through to the empty structure instead of raising.
            pass
    return {"tokens": [], "phrases": {}}
def save_data(data):
    """Write the data (tokens and phrases) to the JSON file.

    Args:
        data (dict): Structure with ``"tokens"`` and ``"phrases"`` keys,
            as produced by ``load_data``.

    ``ensure_ascii=False`` keeps the Italian phrases human-readable in the
    file instead of storing accented characters as ``\\uXXXX`` escapes;
    ``json.load`` round-trips both forms identically.
    """
    with open(DATA_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
# In-memory copy of the persisted token/phrase store; mutated in place and
# flushed back to disk via save_data().
data = load_data()


def save_token_and_phrase(token, phrase):
    """Associate *token* with *phrase* and persist, unless the phrase is already stored."""
    already_known = phrase in data["phrases"]
    if not already_known:
        data["phrases"][phrase] = token
        save_data(data)
def get_valid_predictions(sentence, max_attempts=3, top_k=5):
    """Return fill-mask predictions for *sentence*, preferring a user-saved answer.

    If the exact sentence was previously saved by the user, a single
    pseudo-prediction with score 1.0 is returned. Otherwise the RoBERTa
    fill-mask pipeline is queried up to *max_attempts* times, discarding
    predictions that are bare punctuation, and the first non-empty filtered
    batch is returned (possibly an empty list if every attempt yielded only
    punctuation).

    Args:
        sentence (str): Input text containing a mask placeholder.
        max_attempts (int): Maximum number of pipeline queries.
        top_k (int): Number of candidates requested per query.

    Returns:
        list[dict]: Prediction dicts with ``token_str``, ``score``, ``sequence``.
    """
    if sentence in data["phrases"]:
        token = data["phrases"][sentence]
        # Bug fix: the caller passes the RoBERTa-style "<mask>" placeholder,
        # so replacing only "[MASK]" left a raw mask marker in the rendered
        # sequence. Substitute both spellings.
        sequence = sentence.replace("[MASK]", token).replace("<mask>", token)
        return [{"token_str": token, "score": 1.0, "sequence": sequence}]
    filtered_predictions = []
    for _attempt in range(max_attempts):
        predictions = fill_mask_roberta(sentence, top_k=top_k)
        filtered_predictions = [
            pred for pred in predictions
            if pred["token_str"] not in punctuation_marks
        ]
        if filtered_predictions:
            break
    return filtered_predictions
# --- Sidebar UI: enter and persist token/phrase pairs ---
st.sidebar.header("Gestione Token e Frasi")
token_input = st.sidebar.text_input("Inserisci il token:")
phrase_input = st.sidebar.text_area("Inserisci la frase:")

if st.sidebar.button("Salva Token e Frase"):
    if token_input and phrase_input:
        save_token_and_phrase(token_input, phrase_input)
        st.sidebar.success("Token e frase salvati con successo!")
    else:
        st.sidebar.warning("Inserisci sia un token che una frase validi.")

# Show every phrase saved so far, one per line.
existing_phrases = data.get("phrases", {})
st.sidebar.subheader("Frasi salvate:")
if existing_phrases:
    st.sidebar.write("\n".join(existing_phrases))
else:
    st.sidebar.write("Nessuna frase salvata.")
# CLTK's corpus downloader asks for interactive confirmation; temporarily
# monkey-patch builtins.input to auto-answer "Y" so the download can run
# unattended, then restore the real input() afterwards.
_original_input = builtins.input


def _always_yes(prompt=""):
    """Stand-in for input() that always answers 'Y'."""
    # Echo the prompt and the canned answer so the log shows what happened.
    print(prompt, "Y")
    return "Y"


builtins.input = _always_yes
try:
    corpus_downloader = FetchCorpus(language="lat")
    corpus_downloader.import_corpus("lat_models_cltk")
finally:
    # Bug fix: the original saved _original_input but never restored it,
    # leaving input() patched for the rest of the process.
    builtins.input = _original_input
# Optional dependency: morphological analysis needs the full CLTK NLP
# pipeline. When it cannot be imported, nlp_lat stays None and the
# analysis section of the UI is disabled.
try:
    from cltk import NLP
    nlp_lat = NLP(language="lat")
except ImportError:
    nlp_lat = None
if "input_text_value" not in st.session_state: | |
st.session_state["input_text_value"] = "Lorem ipsum dolor sit amet, [MASK] adipiscing elit." | |
tokenizer_roberta = AutoTokenizer.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased") | |
model_roberta = AutoModelForMaskedLM.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased") | |
fill_mask_roberta = pipeline("fill-mask", model=model_roberta, tokenizer=tokenizer_roberta) | |
punctuation_marks = {".", ",", ";", ":", "!", "?"} | |
# --- Main area: masked-text input and prediction display ---
input_text = st.text_area(
    label="Testo:",
    height=150,
    key="input_text_value",
)

if input_text:
    # RoBERTa expects its own "<mask>" placeholder, not BERT-style "[MASK]".
    input_text_roberta = input_text.replace("[MASK]", "<mask>")
    predictions_roberta = get_valid_predictions(input_text_roberta)

    st.subheader("Risultati delle previsioni:")
    for pred in predictions_roberta:
        st.write(f" Token: {pred['token_str']}")
        st.write(f" Probabilità: {pred['score']:.4f}")
        st.write(f" Sequence: {pred['sequence']}")
        st.write("---")

    if nlp_lat is not None:
        # Run each predicted token through the CLTK Latin pipeline and show
        # lemma / part-of-speech / morphological features.
        st.subheader("Analisi Morfologica con CLTK")
        for pred in predictions_roberta:
            doc = nlp_lat(pred['token_str'])
            st.write(f"Frase: {pred['token_str']}")
            for w in doc.words:
                st.write(
                    f"- **Token**: {w.string}\n"
                    f"  - Lemma: {w.lemma}\n"
                    f"  - UPOS: {w.upos}\n"
                    f"  - Morph: {w.features}\n"
                )
            st.write("---")
    else:
        st.warning("CLTK non installato. Esegui 'pip install cltk' per abilitare l'analisi.")