Spaces:
Sleeping
Sleeping
File size: 6,138 Bytes
6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c 4272847 6725d4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import gradio as gr
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
from sentence_transformers import SentenceTransformer, util
import requests
import warnings
import os
from concurrent.futures import ThreadPoolExecutor
# Process-wide runtime configuration.
# NOTE(review): this assignment runs after the imports above; if TensorFlow is
# pulled in during those imports the log-level setting may be applied too
# late -- confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Quieten TensorFlow's C++ logging

# Silence the two warning categories that otherwise flood the console.
for _noisy_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)

# Credential for the Groq API; None when the variable is not set.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
# LLM-powered sentence segmentation (Llama 3 served through the Groq API)
def segment_into_sentences_groq(passage, timeout=30):
    """Split *passage* into sentences using a Groq-hosted chat model.

    The model is asked to append a sentinel delimiter after every sentence;
    the reply is then split on that delimiter.

    Args:
        passage: Text to segment.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of stripped, non-empty sentences. Empty or whitespace-only
        input yields an empty list without contacting the API.

    Raises:
        ValueError: If the API key is missing, the API answers with a
            non-200 status, or the response body does not have the
            expected shape.
    """
    # Nothing to segment: avoid a network round-trip (and a made-up answer).
    if not passage or not passage.strip():
        return []

    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY environment variable is not set.")

    delimiter = "1!2@3#"
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {
                "role": "system",
                "content": (
                    f"you are to segment the sentence by adding '{delimiter}' "
                    "at the end of each sentence. Return only the segmented "
                    "sentences, nothing else."
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Segment this passage into sentences with '{delimiter}' "
                    f"as a delimiter: {passage}"
                ),
            },
        ],
        "temperature": 0.7,
        "max_tokens": 1024,
    }

    # Without an explicit timeout a stalled connection would block forever.
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != 200:
        raise ValueError(f"Groq API error: {response.text}")

    try:
        segmented_text = response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # ValueError covers a body that is not valid JSON.
        raise ValueError("Unexpected response structure from Groq API.") from err
    if not isinstance(segmented_text, str):
        raise ValueError("Unexpected response structure from Groq API.")

    return [
        piece.strip()
        for piece in segmented_text.split(delimiter)
        if piece.strip()
    ]
class TextEnhancer:
    """Paraphrase, grammar-correct and lightly vary text sentence by sentence.

    Pipeline per sentence: generate paraphrases with a T5 model, keep the
    first one whose cosine similarity to the original meets the threshold,
    run it through a grammar-correction model, then apply small random
    surface variations.
    """

    # CoEdIT is instruction-tuned: the task is stated as a prefix of the input.
    GRAMMAR_INSTRUCTION = "Fix grammatical errors in this sentence: "

    # Characters that already terminate a sentence, and closing marks that
    # may legitimately follow them (quotes, brackets).
    _SENTENCE_TERMINATORS = (".", "!", "?", "\u2026")
    _CLOSING_MARKS = "\"')]}\u201d\u2019"

    # Contractions that _humanize_text occasionally expands.
    _EXPANSIONS = {"can't": "cannot", "won't": "will not", "it's": "it is"}

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.executor = ThreadPoolExecutor(max_workers=3)  # Parallel processing pool
        # Load models
        self._load_models()

    def _load_models(self):
        """Load the paraphrase, grammar and similarity models onto self.device."""
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        )
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        ).to(self.device)
        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            device=0 if self.device == "cuda" else -1,
        )
        self.similarity_model = SentenceTransformer(
            "paraphrase-MiniLM-L6-v2"
        ).to(self.device)

    def enhance_text(self, text, min_similarity=0.8):
        """Return an enhanced version of *text*.

        Args:
            text: Passage to enhance.
            min_similarity: Minimum cosine similarity (0-1) a paraphrase must
                reach to replace its original sentence.

        Returns:
            The processed sentences joined by single spaces, each ending in
            terminal punctuation; an empty string for empty input.
        """
        if not text or not text.strip():
            return ""

        sentences = segment_into_sentences_groq(text)
        # Process sentences in parallel
        results = list(
            self.executor.map(
                lambda s: self._process_sentence(s, min_similarity), sentences
            )
        )
        return self._join_sentences(results)

    @classmethod
    def _join_sentences(cls, sentences):
        """Join sentences with spaces, adding a period only where one is missing."""
        cleaned = []
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            # Look past trailing quotes/brackets when checking for punctuation
            # so that already-terminated sentences are not given a second dot.
            core = sentence.rstrip(cls._CLOSING_MARKS)
            if not core.endswith(cls._SENTENCE_TERMINATORS):
                sentence += "."
            cleaned.append(sentence)
        return " ".join(cleaned)

    @staticmethod
    def _select_paraphrase(paraphrases, similarities, min_similarity):
        """Return the first paraphrase scoring >= min_similarity, else None."""
        return next(
            (
                candidate
                for candidate, score in zip(paraphrases, similarities)
                if score >= min_similarity
            ),
            None,
        )

    def _process_sentence(self, sentence, min_similarity):
        """Paraphrase one sentence; fall back to the original if none qualifies."""
        if not sentence.strip():
            return sentence

        # Generate paraphrases
        inputs = self.paraphrase_tokenizer(
            f"paraphrase: {sentence}",
            return_tensors="pt",
            padding=True,
            max_length=150,
            truncation=True,
        ).to(self.device)
        # Beam search without sampling: `temperature` has no effect in this
        # mode, so it is not passed.
        outputs = self.paraphrase_model.generate(
            **inputs,
            max_length=len(sentence.split()) + 20,
            num_return_sequences=3,
            num_beams=3,
        )
        paraphrases = [
            self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]

        # Calculate semantic similarity
        sentence_embedding = self.similarity_model.encode(
            sentence, convert_to_tensor=True
        )
        paraphrase_embeddings = self.similarity_model.encode(
            paraphrases, convert_to_tensor=True
        )
        # Take row 0 of the (1, n) score matrix; unlike squeeze() this stays a
        # sequence even when only a single paraphrase was scored.
        similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)[
            0
        ].tolist()

        # Grammar-correct the first paraphrase (in generation order) that is
        # similar enough to the original.
        chosen = self._select_paraphrase(paraphrases, similarities, min_similarity)
        if chosen is None:
            return sentence
        corrected = self.grammar_pipeline(self.GRAMMAR_INSTRUCTION + chosen)[0][
            "generated_text"
        ]
        return self._humanize_text(corrected)

    def _humanize_text(self, text):
        """Introduce minor random variations to mimic human-written text.

        Each word has a 10% chance of having a known contraction expanded,
        and with 30% probability a comma is inserted before " and ".
        Whitespace runs are collapsed to single spaces.
        """
        import random

        words = text.split()
        text = " ".join(
            self._EXPANSIONS.get(word, word) if random.random() > 0.9 else word
            for word in words
        )
        if random.random() > 0.7:
            # The second replace undoes the doubled comma produced when the
            # text already contained ", and ".
            text = text.replace(" and ", ", and ").replace(",, and ", ", and ")
        return text
def create_interface():
    """Build the Gradio interface around a freshly constructed TextEnhancer.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    text_enhancer = TextEnhancer()

    def process_text(text, similarity_threshold):
        """Enhance *text*; any failure is reported as an "Error: ..." string."""
        try:
            # The slider works in percent, the enhancer in the 0-1 range.
            fraction = similarity_threshold / 100
            enhanced = text_enhancer.enhance_text(text, min_similarity=fraction)
        except Exception as exc:
            return "Error: " + str(exc)
        return enhanced

    source_box = gr.Textbox(
        label="Input Text",
        placeholder="Enter text to enhance...",
        lines=10,
    )
    threshold_slider = gr.Slider(
        minimum=50,
        maximum=100,
        value=80,
        label="Minimum Semantic Similarity (%)",
    )
    result_box = gr.Textbox(label="Enhanced Text", lines=10)

    return gr.Interface(
        fn=process_text,
        inputs=[source_box, threshold_slider],
        outputs=result_box,
        title="Text Enhancement System",
        description="Improve text quality while preserving original meaning.",
    )
# Script entry point: build the UI and start serving it.
if __name__ == "__main__":
    create_interface().launch()
|