Spaces:
Sleeping
Sleeping
Commit
·
fa80eae
1
Parent(s):
7e40c67
progress spinner attempt and some pandas correction
Browse files
- app.py +6 -7
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import streamlit as st
|
|
| 2 |
import pandas as pd
|
| 3 |
import time
|
| 4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 5 |
-
from transformers import
|
| 6 |
#from transformers import MarianMTModel, MarianTokenizer
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
from pymystem3 import Mystem
|
|
@@ -41,16 +41,13 @@ def translate(text):
|
|
| 41 |
# Tokenize the input text
|
| 42 |
inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
|
| 43 |
|
| 44 |
-
# Get the number of tokens in the input
|
| 45 |
-
input_length = inputs.input_ids.shape[1]
|
| 46 |
-
|
| 47 |
# Set up a simple spinner
|
| 48 |
with tqdm(total=0, bar_format='{desc}', desc="Translating...") as pbar:
|
| 49 |
# Generate translation
|
| 50 |
translated_tokens = translation_model.generate(
|
| 51 |
**inputs,
|
| 52 |
num_beams=5,
|
| 53 |
-
max_length=
|
| 54 |
no_repeat_ngram_size=2,
|
| 55 |
early_stopping=True
|
| 56 |
)
|
|
@@ -112,9 +109,11 @@ def fuzzy_deduplicate(df, column, threshold=65):
|
|
| 112 |
def process_file(uploaded_file):
|
| 113 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 114 |
|
|
|
|
|
|
|
| 115 |
# Apply fuzzy deduplication
|
| 116 |
-
df = df.groupby('Объект').apply(lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)).reset_index(drop=True)
|
| 117 |
-
|
| 118 |
# Translate texts
|
| 119 |
translated_texts = []
|
| 120 |
progress_bar = st.progress(0)
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import time
|
| 4 |
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 5 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
| 6 |
#from transformers import MarianMTModel, MarianTokenizer
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
from pymystem3 import Mystem
|
|
|
|
| 41 |
# Tokenize the input text
|
| 42 |
inputs = translation_tokenizer(text, return_tensors="pt", truncation=True)
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
# Set up a simple spinner
|
| 45 |
with tqdm(total=0, bar_format='{desc}', desc="Translating...") as pbar:
|
| 46 |
# Generate translation
|
| 47 |
translated_tokens = translation_model.generate(
|
| 48 |
**inputs,
|
| 49 |
num_beams=5,
|
| 50 |
+
max_length=len(text.split()) * 2, # Adjust as needed
|
| 51 |
no_repeat_ngram_size=2,
|
| 52 |
early_stopping=True
|
| 53 |
)
|
|
|
|
| 109 |
def process_file(uploaded_file):
|
| 110 |
df = pd.read_excel(uploaded_file, sheet_name='Публикации')
|
| 111 |
|
| 112 |
+
|
| 113 |
+
|
| 114 |
# Apply fuzzy deduplication
|
| 115 |
+
df = df.groupby('Объект', group_keys=False).apply(lambda x: fuzzy_deduplicate(x, 'Выдержки из текста', 65)).reset_index(drop=True)
|
| 116 |
+
|
| 117 |
# Translate texts
|
| 118 |
translated_texts = []
|
| 119 |
progress_bar = st.progress(0)
|
requirements.txt
CHANGED
|
@@ -3,8 +3,10 @@ pandas
|
|
| 3 |
vaderSentiment
|
| 4 |
transformers>=4.30.0
|
| 5 |
torch
|
|
|
|
| 6 |
sentencepiece
|
| 7 |
pymystem3
|
| 8 |
openpyxl
|
| 9 |
rapidfuzz
|
| 10 |
-
matplotlib
|
|
|
|
|
|
| 3 |
vaderSentiment
|
| 4 |
transformers>=4.30.0
|
| 5 |
torch
|
| 6 |
+
tqdm
|
| 7 |
sentencepiece
|
| 8 |
pymystem3
|
| 9 |
openpyxl
|
| 10 |
rapidfuzz
|
| 11 |
+
matplotlib
|
| 12 |
+
sacremoses
|