Cicciokr's picture
Create app.py
82e05bc verified
raw
history blame
4.13 kB
import streamlit as st
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from cltk.data.fetch import FetchCorpus
import builtins
import os
import json
# Persistent store for user-supplied tokens and phrases.
DATA_FILE = "data.json"


def load_data():
    """Load the saved data (tokens and phrases) from the JSON file.

    Returns a dict with "tokens" (list) and "phrases" (dict) keys;
    falls back to an empty structure when the file does not exist yet.
    """
    if not os.path.exists(DATA_FILE):
        return {"tokens": [], "phrases": {}}
    with open(DATA_FILE, "r", encoding="utf-8") as fh:
        return json.load(fh)
def save_data(data):
    """Persist the data (tokens and phrases) to the JSON file.

    ``ensure_ascii=False`` keeps accented Italian text human-readable
    in the file instead of escaping it to \\uXXXX sequences; this is
    safe because the file is written (and read by ``load_data``) as
    UTF-8.
    """
    with open(DATA_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
# Module-level cache of the persisted tokens/phrases, loaded once at startup.
data = load_data()


def save_token_and_phrase(token, phrase):
    """Associate *phrase* with *token* and persist the store.

    Existing associations are never overwritten: a phrase that is
    already known is silently left as-is.
    """
    if phrase in data["phrases"]:
        return
    data["phrases"][phrase] = token
    save_data(data)
def get_valid_predictions(sentence, max_attempts=3, top_k=5):
    """Return fill-mask predictions for *sentence*, preferring a saved phrase.

    If the sentence matches a phrase previously saved via the sidebar, its
    stored token is returned with score 1.0; otherwise the fill-mask
    pipeline is queried up to *max_attempts* times, discarding predictions
    that are bare punctuation.

    Bug fix: callers pass the sentence with the RoBERTa-style "<mask>"
    token (see the text-area handler), while phrases saved from the
    sidebar contain the raw user text — typically "[MASK]".  The original
    code looked up only the "<mask>" form (so the cache never matched)
    and substituted via ``replace("[MASK]", ...)`` on a sentence that no
    longer contained "[MASK]".  We now try both spellings for the lookup
    and substitute whichever mask token the sentence actually contains.
    """
    mask_token = "<mask>" if "<mask>" in sentence else "[MASK]"
    for key in (sentence, sentence.replace("<mask>", "[MASK]")):
        if key in data["phrases"]:
            saved_token = data["phrases"][key]
            return [{
                "token_str": saved_token,
                "score": 1.0,
                "sequence": sentence.replace(mask_token, saved_token),
            }]

    filtered_predictions = []
    for _ in range(max_attempts):
        predictions = fill_mask_roberta(sentence, top_k=top_k)
        filtered_predictions = [
            pred for pred in predictions if pred["token_str"] not in punctuation_marks
        ]
        if filtered_predictions:
            # NOTE(review): the pipeline is deterministic, so retrying with
            # identical arguments cannot change the outcome; the loop is
            # kept for interface compatibility.
            break
    return filtered_predictions
# --- Sidebar: token/phrase management UI ---
st.sidebar.header("Gestione Token e Frasi")
token_input = st.sidebar.text_input("Inserisci il token:")
phrase_input = st.sidebar.text_area("Inserisci la frase:")

if st.sidebar.button("Salva Token e Frase"):
    # Both fields must be non-empty before persisting.
    if not (token_input and phrase_input):
        st.sidebar.warning("Inserisci sia un token che una frase validi.")
    else:
        save_token_and_phrase(token_input, phrase_input)
        st.sidebar.success("Token e frase salvati con successo!")

# List the phrases already on disk.
existing_phrases = data.get("phrases", {})
st.sidebar.subheader("Frasi salvate:")
if existing_phrases:
    st.sidebar.write("\n".join(existing_phrases.keys()))
else:
    st.sidebar.write("Nessuna frase salvata.")
# CLTK's corpus downloader asks for interactive confirmation, which would
# hang a headless Streamlit app.  Temporarily answer "Y" automatically,
# and — fix — restore the real input() afterwards so the monkey-patch
# does not leak into the rest of the process (the original code never
# restored it).
_original_input = builtins.input


def _always_yes(prompt=""):
    # Echo the prompt so the auto-answer is visible in the logs.
    print(prompt, "Y")
    return "Y"


builtins.input = _always_yes
try:
    corpus_downloader = FetchCorpus(language="lat")
    corpus_downloader.import_corpus("lat_models_cltk")
    # NLP model initialisation may also prompt for downloads, so keep the
    # patch in place until it has finished.
    from cltk import NLP
    nlp_lat = NLP(language="lat")
except ImportError:
    # CLTK unavailable: the morphological-analysis section is disabled.
    nlp_lat = None
finally:
    builtins.input = _original_input
# Default masked sentence shown in the text area on first load.
if "input_text_value" not in st.session_state:
    st.session_state["input_text_value"] = "Lorem ipsum dolor sit amet, [MASK] adipiscing elit."

# Latin masked-language model and its fill-mask pipeline.
_MODEL_ID = "Cicciokr/Roberta-Base-Latin-Uncased"
tokenizer_roberta = AutoTokenizer.from_pretrained(_MODEL_ID)
model_roberta = AutoModelForMaskedLM.from_pretrained(_MODEL_ID)
fill_mask_roberta = pipeline("fill-mask", model=model_roberta, tokenizer=tokenizer_roberta)

# Predictions consisting of bare punctuation are filtered out downstream.
punctuation_marks = {".", ",", ";", ":", "!", "?"}
# --- Main panel: masked-word prediction and optional morphology ---
input_text = st.text_area(
    label="Testo:",
    height=150,
    key="input_text_value"
)

if input_text:
    # The RoBERTa tokenizer expects "<mask>", while users type "[MASK]".
    masked_sentence = input_text.replace("[MASK]", "<mask>")
    predictions = get_valid_predictions(masked_sentence)

    st.subheader("Risultati delle previsioni:")
    for prediction in predictions:
        st.write(f" Token: {prediction['token_str']}")
        st.write(f" Probabilità: {prediction['score']:.4f}")
        st.write(f" Sequence: {prediction['sequence']}")
        st.write("---")

    if nlp_lat is None:
        st.warning("CLTK non installato. Esegui 'pip install cltk' per abilitare l'analisi.")
    else:
        # Run each predicted token through the CLTK Latin pipeline.
        st.subheader("Analisi Morfologica con CLTK")
        for prediction in predictions:
            analysis = nlp_lat(prediction['token_str'])
            st.write(f"Frase: {prediction['token_str']}")
            for word in analysis.words:
                st.write(
                    f"- **Token**: {word.string}\n"
                    f" - Lemma: {word.lemma}\n"
                    f" - UPOS: {word.upos}\n"
                    f" - Morph: {word.features}\n"
                )
            st.write("---")