Spaces:
Running
Running
File size: 3,812 Bytes
1cdf555 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import json as js
import os
import re
from typing import List, Tuple

import fasttext
import gradio as gr
import joblib
import omikuji
from huggingface_hub import snapshot_download

from install_packages import download_model
# Download the fastText language-identification model (lid.176.bin).
download_model('https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin', 'lid.176.bin')
# Download the per-language omikuji topic models from Hugging Face.
for repo_id in ['kapllan/omikuji-bonsai-parliament-de-spacy', 'kapllan/omikuji-bonsai-parliament-fr-spacy',
                'kapllan/omikuji-bonsai-parliament-it-spacy']:
    # exist_ok=True replaces the manual os.path.exists() check.
    os.makedirs(repo_id, exist_ok=True)
    # Return value (local snapshot path) is not needed; files land in repo_id.
    snapshot_download(repo_id=repo_id, local_dir=repo_id)
# Language-identification model used by predict_lang.
lang_model = fasttext.load_model('lid.176.bin')
# Mapping from omikuji subject id (as string) to human-readable topic label.
with open('./id2label.json', 'r') as f:
    id2label = js.load(f)
def map_language(language: str) -> str:
    """Map an ISO 639-1 code ('de', 'it', 'fr') to its English name.

    Any other value is returned unchanged.
    """
    language_mapping = {'de': 'German',
                        'it': 'Italian',
                        'fr': 'French'}
    # dict.get with a default replaces the `in ... .keys()` membership test.
    return language_mapping.get(language, language)
def find_model(language: str):
    """Load the TF-IDF vectorizer and omikuji model for a language code.

    Returns (None, None) when the language is not one of 'de', 'fr', 'it'.
    """
    if language not in ['de', 'fr', 'it']:
        return None, None
    base_path = f'./kapllan/omikuji-bonsai-parliament-{language}-spacy'
    vectorizer = joblib.load(f'{base_path}/vectorizer')
    model = omikuji.Model.load(f'{base_path}/omikuji-model')
    return vectorizer, model
def predict_lang(text: str) -> str:
    """Detect the language of *text* with fastText.

    Returns a bare ISO 639-1 code such as 'de' (the '__label__' prefix
    emitted by fastText is stripped).
    """
    # fastText cannot process newlines in the input, so remove them first.
    text = re.sub(r'\n', '', text)
    # k=1: only the single best-matching language is requested.
    predictions = lang_model.predict(text, k=1)
    language = predictions[0][0]  # e.g. '__label__de'
    language = re.sub(r'__label__', '', language)
    return language
def predict_topic(text: str) -> Tuple[List[Tuple[str, float]], str]:
    """Predict topics for *text* in its auto-detected language.

    Returns:
        A list of (label, score) pairs — empty when the language is
        unsupported or the text vectorizes to an all-zero vector — and
        the detected language mapped to its English name.
    """
    results = []
    language = predict_lang(text)
    # find_model returns (None, None) for unsupported languages.
    vectorizer, model = find_model(language)
    language = map_language(language)
    if vectorizer is not None:
        texts = [text]
        vector = vectorizer.transform(texts)
        for row in vector:
            if row.nnz == 0:  # All zero vector, empty result
                continue
            # omikuji expects (feature_index, value) pairs for the nonzero entries.
            feature_values = [(col, row[0, col]) for col in row.nonzero()[1]]
            for subj_id, score in model.predict(feature_values, top_k=1000):
                results.append((id2label[str(subj_id)], score))
    return results, language
def topic_modeling(text: str, threshold: float) -> Tuple[List[Tuple[str, float]], str]:
    """Predict topics for *text*, keeping only scores >= *threshold*.

    Returns the filtered (label, score) pairs — empty for unsupported
    languages — and the detected language name.
    """
    sorted_topics, language = predict_topic(text)
    # Only the three supported languages yield usable topic predictions.
    if sorted_topics and language in ['German', 'French', 'Italian']:
        sorted_topics = [t for t in sorted_topics if t[1] >= threshold]
    else:
        sorted_topics = []
    return sorted_topics, language
# Build the Gradio UI: a text input plus threshold slider on the left,
# a (label, score) results table on the right.
with gr.Blocks() as iface:
    gr.Markdown("# Topic Modeling")
    gr.Markdown("Enter a document and get each topic along with its score.")
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=10, placeholder="Enter a document")
            submit_button = gr.Button("Submit")
            # Minimum score a topic must reach to appear in the output table.
            threshold_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Score Threshold", value=0.0)
            # Read-only display of the fastText-detected language.
            language_text = gr.Textbox(lines=1, placeholder="Detected language will be shown here...",
                                       interactive=False, label="Detected Language")
        with gr.Column():
            output_data = gr.Dataframe(headers=["Label", "Score"])
    # Wire the button to topic_modeling: (text, threshold) -> (topics table, language).
    submit_button.click(topic_modeling, inputs=[input_text, threshold_slider], outputs=[output_data, language_text])
# Launch the app
iface.launch(share=True)
|