import streamlit as st
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from cltk.data.fetch import FetchCorpus
import builtins
import os
import json
DATA_FILE = "data.json"
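# Assumed layout of data.json (inferred from load_data's default below):
# {"tokens": [...], "phrases": {"<phrase containing [MASK]>": "<token>"}}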
def load_data():
    """Load the saved data (tokens and phrases) from the JSON file."""
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    return {"tokens": [], "phrases": {}}
def save_data(data):
    """Save the data (tokens and phrases) to the JSON file."""
    with open(DATA_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
data = load_data()
def save_token_and_phrase(token, phrase):
    """Store a phrase -> token mapping, unless the phrase is already saved."""
    if phrase not in data["phrases"]:
        data["phrases"][phrase] = token
        save_data(data)
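# Hypothetical usage:
# save_token_and_phrase("consectetur", "Lorem ipsum dolor sit amet, [MASK] adipiscing elit.")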
def get_valid_predictions(sentence, max_attempts=3, top_k=5):
    """Return the cached token if the phrase was already saved, otherwise query the model."""
    # The caller passes RoBERTa-style "<mask>", while phrases are saved with
    # "[MASK]", so normalise the key before the lookup.
    key = sentence.replace("<mask>", "[MASK]")
    if key in data["phrases"]:
        token = data["phrases"][key]
        return [{"token_str": token, "score": 1.0, "sequence": key.replace("[MASK]", token)}]
    attempt = 0
    filtered_predictions = []
    while attempt < max_attempts:
        predictions = fill_mask_roberta(sentence, top_k=top_k)
        # Discard predictions that are bare punctuation.
        filtered_predictions = [
            pred for pred in predictions if pred["token_str"] not in punctuation_marks
        ]
        if filtered_predictions:
            break
        attempt += 1
    return filtered_predictions
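# Illustrative return shape (the values are made up; the dict keys match the
# Hugging Face fill-mask pipeline output):
# [{"token_str": "consectetur", "score": 0.93,
#   "sequence": "lorem ipsum dolor sit amet, consectetur adipiscing elit."}]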
# UI for entering the token and phrases
st.sidebar.header("Token and Phrase Management")
token_input = st.sidebar.text_input("Enter the token:")
phrase_input = st.sidebar.text_area("Enter the phrase:")
if st.sidebar.button("Save Token and Phrase"):
    if token_input and phrase_input:
        save_token_and_phrase(token_input, phrase_input)
        st.sidebar.success("Token and phrase saved successfully!")
    else:
        st.sidebar.warning("Enter both a valid token and a valid phrase.")
existing_phrases = data.get("phrases", {})
st.sidebar.subheader("Saved phrases:")
st.sidebar.write("\n".join(existing_phrases.keys()) if existing_phrases else "No phrases saved.")
# CLTK's corpus downloader asks for interactive confirmation before
# downloading; temporarily monkey-patch input() to auto-answer "Y".
_original_input = builtins.input
def _always_yes(prompt=""):
    print(prompt, "Y")  # so the log shows that we answered "Y"
    return "Y"
builtins.input = _always_yes
corpus_downloader = FetchCorpus(language="lat")
corpus_downloader.import_corpus("lat_models_cltk")
builtins.input = _original_input  # restore the original input()
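# FetchCorpus stores its downloads under the local ~/cltk_data directory,
# so subsequent runs should reuse the already-fetched model files.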
try:
    from cltk import NLP
    nlp_lat = NLP(language="lat")
except ImportError:
    nlp_lat = None
if "input_text_value" not in st.session_state:
st.session_state["input_text_value"] = "Lorem ipsum dolor sit amet, [MASK] adipiscing elit."
tokenizer_roberta = AutoTokenizer.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
model_roberta = AutoModelForMaskedLM.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
fill_mask_roberta = pipeline("fill-mask", model=model_roberta, tokenizer=tokenizer_roberta)
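# Note: Streamlit re-runs this script on every interaction, so the model and
# tokenizer above are reloaded each time. A cached variant could look like the
# following sketch (st.cache_resource is Streamlit's resource cache;
# load_fill_mask is a hypothetical helper name):
#
#     @st.cache_resource
#     def load_fill_mask():
#         tok = AutoTokenizer.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
#         mdl = AutoModelForMaskedLM.from_pretrained("Cicciokr/Roberta-Base-Latin-Uncased")
#         return pipeline("fill-mask", model=mdl, tokenizer=tok)
#
#     fill_mask_roberta = load_fill_mask()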
punctuation_marks = {".", ",", ";", ":", "!", "?"}
input_text = st.text_area(
    label="Text:",
    height=150,
    key="input_text_value"
)
if input_text:
    # RoBERTa-style tokenizers expect "<mask>" rather than BERT-style "[MASK]".
    input_text_roberta = input_text.replace("[MASK]", "<mask>")
    predictions_roberta = get_valid_predictions(input_text_roberta)
    st.subheader("Prediction results:")
    for pred in predictions_roberta:
        st.write(f" Token: {pred['token_str']}")
        st.write(f" Probability: {pred['score']:.4f}")
        st.write(f" Sequence: {pred['sequence']}")
        st.write("---")
    if nlp_lat is not None:
        st.subheader("Morphological Analysis with CLTK")
        for pred in predictions_roberta:
            # CLTK's NLP object is used through analyze(); it is not callable directly.
            doc = nlp_lat.analyze(text=pred['token_str'])
            st.write(f"Predicted token: {pred['token_str']}")
            for w in doc.words:
                st.write(
                    f"- **Token**: {w.string}\n"
                    f"  - Lemma: {w.lemma}\n"
                    f"  - UPOS: {w.upos}\n"
                    f"  - Morph: {w.features}\n"
                )
            st.write("---")
    else:
        st.warning("CLTK is not installed. Run 'pip install cltk' to enable the analysis.")