Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline | |
| from sentence_transformers import SentenceTransformer, util | |
| import requests | |
| import warnings | |
| import os | |
| from concurrent.futures import ThreadPoolExecutor | |
# Runtime configuration: quiet noisy libraries and read the API credential.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Reduce TensorFlow verbosity

# Ignore FutureWarning and UserWarning (registered in that order).
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)

# Groq API key taken from the environment; None when the variable is unset.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
# Sentence segmentation backed by the Groq chat-completions API.
def segment_into_sentences_groq(passage):
    """Split *passage* into sentences using a Groq-hosted chat model.

    The model is asked to append the marker ``1!2@3#`` after every sentence;
    the reply is then split on that marker.

    Args:
        passage: Text to segment.

    Returns:
        A list of non-empty, whitespace-stripped sentences. An empty or
        whitespace-only passage yields ``[]`` without contacting the API.

    Raises:
        ValueError: If the module-level ``GROQ_API_KEY`` is empty, the API
            answers with a non-200 status, or the response body does not have
            the expected structure.
        requests.RequestException: On network failure or timeout.
    """
    # Nothing to segment: avoid a pointless remote call.
    if not passage or not passage.strip():
        return []

    # Fail early with a clear message instead of sending "Bearer None".
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY environment variable is not set.")

    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "llama3-8b-8192",
        "messages": [
            {
                "role": "system",
                "content": "you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences, nothing else.",
            },
            {
                "role": "user",
                "content": f"Segment this passage into sentences with '1!2@3#' as a delimiter: {passage}",
            },
        ],
        "temperature": 0.7,
        "max_tokens": 1024,
    }

    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=30,
    )
    if response.status_code != 200:
        raise ValueError(f"Groq API error: {response.text}")

    try:
        segmented_text = response.json()["choices"][0]["message"]["content"]
        sentences = segmented_text.split("1!2@3#")
    except (KeyError, IndexError, TypeError, AttributeError) as err:
        # TypeError/AttributeError cover bodies such as a JSON list or a
        # null "content" field, which are just as malformed as a missing key.
        raise ValueError("Unexpected response structure from Groq API.") from err

    return [sentence.strip() for sentence in sentences if sentence.strip()]
class TextEnhancer:
    """Rewrite text sentence by sentence with paraphrasing and grammar fixes.

    Each sentence is paraphrased by a T5 model. The paraphrase closest in
    meaning to the original is kept only if its cosine similarity reaches the
    caller's threshold; the survivor is passed through a grammar-correction
    pipeline and then given small randomized wording tweaks.
    """

    # Instruction placed in front of the text sent to the grammar model.
    # NOTE(review): wording follows the published CoEdIT usage example —
    # confirm against the model card.
    _GRAMMAR_INSTRUCTION = "Fix grammatical errors in this sentence: "

    # Word-level substitutions applied at random by _humanize_text.
    _CONTRACTIONS = {"can't": "cannot", "won't": "will not", "it's": "it is"}

    # Characters that already terminate a sentence.
    _SENTENCE_ENDINGS = (".", "!", "?", "\u2026")

    # Closing quotes/brackets that may follow the terminal punctuation.
    _TRAILING_CLOSERS = "\"')]\u201d\u2019"

    def __init__(self):
        """Select the compute device, create the worker pool and load models."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.executor = ThreadPoolExecutor(max_workers=3)  # Parallel processing pool
        # Load models
        self._load_models()

    def _load_models(self):
        """Load the paraphrase, grammar-correction and similarity models."""
        self.paraphrase_tokenizer = AutoTokenizer.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        )
        self.paraphrase_model = T5ForConditionalGeneration.from_pretrained(
            "prithivida/parrot_paraphraser_on_T5"
        ).to(self.device)
        self.grammar_pipeline = pipeline(
            "text2text-generation",
            model="Grammarly/coedit-large",
            device=0 if self.device == "cuda" else -1,
        )
        self.similarity_model = SentenceTransformer(
            "paraphrase-MiniLM-L6-v2"
        ).to(self.device)

    def enhance_text(self, text, min_similarity=0.8):
        """Return an enhanced version of *text*.

        Args:
            text: Passage to enhance.
            min_similarity: Minimum cosine similarity a paraphrase must have
                with its source sentence to replace it.

        Returns:
            The processed sentences joined by single spaces. Empty or
            whitespace-only input yields ``""`` without any remote call.
        """
        if not text or not text.strip():
            return ""

        sentences = segment_into_sentences_groq(text)
        # Process sentences in parallel; map() preserves the input order.
        results = list(
            self.executor.map(
                lambda s: self._process_sentence(s, min_similarity), sentences
            )
        )
        return self._join_sentences(results)

    @staticmethod
    def _join_sentences(sentences):
        """Join sentences with spaces, adding a period only where one is missing.

        Sentences that already end in terminal punctuation are left as they
        are, so the result never contains doubled punctuation such as
        ``"Hello.. Hi?."``. Blank entries are dropped.
        """
        finished = []
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            core = sentence.rstrip(TextEnhancer._TRAILING_CLOSERS)
            if not core.endswith(TextEnhancer._SENTENCE_ENDINGS):
                sentence += "."
            finished.append(sentence)
        return " ".join(finished)

    def _process_sentence(self, sentence, min_similarity):
        """Paraphrase one sentence, or return it unchanged.

        The original sentence is returned when it is blank or when no
        generated paraphrase reaches *min_similarity*.
        """
        if not sentence.strip():
            return sentence

        # Generate paraphrases
        inputs = self.paraphrase_tokenizer(
            f"paraphrase: {sentence}",
            return_tensors="pt",
            padding=True,
            max_length=150,
            truncation=True,
        ).to(self.device)
        # NOTE(review): temperature normally only affects sampling; with beam
        # search it is presumably ignored — confirm before relying on it.
        outputs = self.paraphrase_model.generate(
            **inputs,
            max_length=len(sentence.split()) + 20,
            num_return_sequences=3,
            num_beams=3,
            temperature=0.7,
        )
        paraphrases = [
            self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs
        ]
        if not paraphrases:
            return sentence

        # Calculate semantic similarity
        sentence_embedding = self.similarity_model.encode(
            sentence, convert_to_tensor=True
        )
        paraphrase_embeddings = self.similarity_model.encode(
            paraphrases, convert_to_tensor=True
        )
        # Index the single query row instead of squeeze(): squeeze() would
        # collapse to a 0-d tensor if only one paraphrase were returned.
        scores = util.cos_sim(sentence_embedding, paraphrase_embeddings)[0].tolist()

        # Keep the paraphrase that is actually the most similar, not merely
        # the first one above the threshold.
        best_score, best_paraphrase = max(
            zip(scores, paraphrases), key=lambda pair: pair[0]
        )
        if best_score < min_similarity:
            return sentence

        # Grammar correction for the most similar paraphrase
        corrected = self.grammar_pipeline(
            self._GRAMMAR_INSTRUCTION + best_paraphrase
        )[0]["generated_text"]
        return self._humanize_text(corrected)

    def _humanize_text(self, text):
        """Apply small random wording variations to *text*.

        Each whitespace-separated word found in ``_CONTRACTIONS`` is replaced
        with probability 0.1, and with probability 0.3 a comma is inserted
        before every ``" and "`` that is not already preceded by punctuation.
        Runs of whitespace are collapsed to single spaces.
        """
        import random
        import re

        words = [
            self._CONTRACTIONS.get(word, word) if random.random() > 0.9 else word
            for word in text.split()
        ]
        text = " ".join(words)
        if random.random() > 0.7:
            # The lookbehind skips "... , and ..." so no ",, and" is produced.
            text = re.sub(r"(?<![,;:]) and ", ", and ", text)
        return text
def create_interface():
    """Build the Gradio interface for the text enhancer.

    A single ``TextEnhancer`` is created here and shared by every request
    handled by the returned interface.

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    import logging

    logger = logging.getLogger(__name__)
    enhancer = TextEnhancer()

    def process_text(text, similarity_threshold):
        """Enhance *text*; *similarity_threshold* is a percentage (50-100)."""
        # Blank input: nothing to enhance, so skip model and API work.
        if not text or not text.strip():
            return ""
        try:
            return enhancer.enhance_text(
                text, min_similarity=similarity_threshold / 100
            )
        except Exception as e:
            # UI boundary: show the message to the user, but keep the
            # traceback in the logs instead of discarding it.
            logger.exception("Text enhancement failed")
            return f"Error: {str(e)}"

    interface = gr.Interface(
        fn=process_text,
        inputs=[
            gr.Textbox(
                label="Input Text",
                placeholder="Enter text to enhance...",
                lines=10,
            ),
            gr.Slider(
                minimum=50,
                maximum=100,
                value=80,
                label="Minimum Semantic Similarity (%)",
            ),
        ],
        outputs=gr.Textbox(label="Enhanced Text", lines=10),
        title="Text Enhancement System",
        description="Improve text quality while preserving original meaning.",
    )
    return interface
# Script entry point: build the UI and start serving it.
if __name__ == "__main__":
    create_interface().launch()