Spaces:

Jaane
/

SOP_improved

Sleeping

App Files Community

Jaane committed on Nov 18, 2024

Commit

4272847

verified ·

1 Parent(s): dfeb6d2

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -102

app.py CHANGED Viewed

@@ -2,26 +2,18 @@ import gradio as gr
 import torch
 from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
 from sentence_transformers import SentenceTransformer, util
-import openai
-import random
-import re
 import requests
 import warnings
-from transformers import logging
 import os
-import tensorflow as tf
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # 0 = all messages, 1 = INFO, 2 = WARNING, 3 = ERROR
-tf.get_logger().setLevel('ERROR')
-# Suppress Python warnings
 warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress FutureWarnings
 warnings.filterwarnings("ignore", category=UserWarning)    # Suppress UserWarnings
-warnings.filterwarnings("ignore")                          # Suppress all warnings (optional)
-# Suppress Hugging Face Transformers warnings
-logging.set_verbosity_error()
 # LLM-powered sentence segmentation function (Llama 3 via the Groq API)
 def segment_into_sentences_groq(passage):
     headers = {
@@ -32,128 +24,113 @@ def segment_into_sentences_groq(passage):
         "model": "llama3-8b-8192",
         "messages": [
             {
-            "role": "system",
-            "content": "you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences only return the modified passage and nothing else do not add your responses"
-        },
             {
                 "role": "user",
-                "content": f"you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences only return the modified passage and nothing else do not add your responses. here is the passage:{passage}"
             }
         ],
-        "temperature": 1.0,
-        "max_tokens": 8192
     }
-    print("response sent")
     response = requests.post("https://api.groq.com/openai/v1/chat/completions", json=payload, headers=headers)
-    print("response recieved")
     if response.status_code == 200:
-        data = response.json()
         try:
-            segmented_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
-            print("SOP segmented")
-            # Split sentences based on the custom token
             sentences = segmented_text.split("1!2@3#")
             return [sentence.strip() for sentence in sentences if sentence.strip()]
-        except (IndexError, KeyError):
             raise ValueError("Unexpected response structure from Groq API.")
     else:
         raise ValueError(f"Groq API error: {response.text}")
 class TextEnhancer:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Initialize paraphrase model
         self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
         self.paraphrase_model = T5ForConditionalGeneration.from_pretrained("prithivida/parrot_paraphraser_on_T5").to(self.device)
-        print("paraphraser loaded")
-        # Initialize grammar correction
         self.grammar_pipeline = pipeline(
             "text2text-generation",
             model="Grammarly/coedit-large",
             device=0 if self.device == "cuda" else -1
         )
-        print("grammar check loaded")
-        # Initialize semantic similarity model
         self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
-        print("sementics model loaded")
-    def enhance_text(self, text, min_similarity=0.8, max_variations=3):
-        # Use GPT for sentence segmentation
         sentences = segment_into_sentences_groq(text)
-        enhanced_sentences = []
-        for sentence in sentences:
-            if not sentence.strip():
-                continue
-            inputs = self.paraphrase_tokenizer(
-                f"paraphrase: {sentence}",
-                return_tensors="pt",
-                padding=True,
-                max_length=150,
-                truncation=True
-            ).to(self.device)
-            outputs = self.paraphrase_model.generate(
-                **inputs,
-                max_length=len(sentence.split()) + 20,
-                num_return_sequences=max_variations,
-                num_beams=max_variations,
-                temperature=0.7
-            )
-            paraphrases = [
-                self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
-                for output in outputs
-            ]
-            sentence_embedding = self.similarity_model.encode(sentence)
-            paraphrase_embeddings = self.similarity_model.encode(paraphrases)
-            similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
-            valid_paraphrases = [
-                para for para, sim in zip(paraphrases, similarities[0])
-                if sim >= min_similarity
-            ]
-            if valid_paraphrases:
-                corrected = self.grammar_pipeline(
-                    valid_paraphrases[0],
-                    max_length=150,
-                    num_return_sequences=1
-                )[0]["generated_text"]
-                corrected = self._humanize_text(corrected)
-                enhanced_sentences.append(corrected)
-            else:
-                enhanced_sentences.append(sentence)
-                print(sentence)
-        enhanced_text = ". ".join(sentence.rstrip(".") for sentence in enhanced_sentences) + "."
-        return enhanced_text
     def _humanize_text(self, text):
-        """
-        Introduce small variations to make text appear more 'human-like'
-        """
-        # Randomly replace contractions in some sentences
-        contractions = {"can't": "cannot", "won't": "will not", "I'm": "I am", "it's": "it is"}
         words = text.split()
         text = " ".join([contractions.get(word, word) if random.random() > 0.9 else word for word in words])
-        # Add optional comma variations for natural breaks
         if random.random() > 0.7:
             text = text.replace(" and ", ", and ")
-        # Minor variations in sentence structure
-        if random.random() > 0.5:
-            text = text.replace(" is ", " happens to be ")
         return text
@@ -162,12 +139,7 @@ def create_interface():
     def process_text(text, similarity_threshold):
         try:
-            enhanced = enhancer.enhance_text(
-                text,
-                min_similarity=similarity_threshold / 100
-            )
-            print("grammar enhanced")
-            return enhanced
         except Exception as e:
             return f"Error: {str(e)}"
@@ -188,9 +160,8 @@ def create_interface():
         ],
         outputs=gr.Textbox(label="Enhanced Text", lines=10),
         title="Text Enhancement System",
-        description="Improve text quality while preserving original meaning"
     )
     return interface

 import torch
 from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
 from sentence_transformers import SentenceTransformer, util
 import requests
 import warnings
 import os
+from concurrent.futures import ThreadPoolExecutor
+# Set environment variables and suppress warnings
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Reduce TensorFlow verbosity
 warnings.filterwarnings("ignore", category=FutureWarning)  # Suppress FutureWarnings
 warnings.filterwarnings("ignore", category=UserWarning)    # Suppress UserWarnings
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
 # LLM-powered sentence segmentation function (Llama 3 via the Groq API)
 def segment_into_sentences_groq(passage):
     headers = {
         "model": "llama3-8b-8192",
         "messages": [
             {
+                "role": "system",
+                "content": "you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences, nothing else."
+            },
             {
                 "role": "user",
+                "content": f"Segment this passage into sentences with '1!2@3#' as a delimiter: {passage}"
             }
         ],
+        "temperature": 0.7,
+        "max_tokens": 1024
     }
     response = requests.post("https://api.groq.com/openai/v1/chat/completions", json=payload, headers=headers)
     if response.status_code == 200:
         try:
+            segmented_text = response.json()["choices"][0]["message"]["content"]
             sentences = segmented_text.split("1!2@3#")
             return [sentence.strip() for sentence in sentences if sentence.strip()]
+        except (KeyError, IndexError):
             raise ValueError("Unexpected response structure from Groq API.")
     else:
         raise ValueError(f"Groq API error: {response.text}")
 class TextEnhancer:
     def __init__(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.executor = ThreadPoolExecutor(max_workers=3)  # Parallel processing pool
+        # Load models
+        self._load_models()
+    def _load_models(self):
         self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
         self.paraphrase_model = T5ForConditionalGeneration.from_pretrained("prithivida/parrot_paraphraser_on_T5").to(self.device)
         self.grammar_pipeline = pipeline(
             "text2text-generation",
             model="Grammarly/coedit-large",
             device=0 if self.device == "cuda" else -1
         )
         self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
+    def enhance_text(self, text, min_similarity=0.8):
         sentences = segment_into_sentences_groq(text)
+        # Process sentences in parallel
+        results = list(self.executor.map(lambda s: self._process_sentence(s, min_similarity), sentences))
+        # Join enhanced sentences into a single text
+        enhanced_text = ". ".join(results).strip() + "."
+        return enhanced_text
+    def _process_sentence(self, sentence, min_similarity):
+        if not sentence.strip():
+            return sentence
+        # Generate paraphrases
+        inputs = self.paraphrase_tokenizer(
+            f"paraphrase: {sentence}",
+            return_tensors="pt",
+            padding=True,
+            max_length=150,
+            truncation=True
+        ).to(self.device)
+        outputs = self.paraphrase_model.generate(
+            **inputs,
+            max_length=len(sentence.split()) + 20,
+            num_return_sequences=3,
+            num_beams=3,
+            temperature=0.7
+        )
+        paraphrases = [
+            self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
+            for output in outputs
+        ]
+        # Calculate semantic similarity
+        sentence_embedding = self.similarity_model.encode(sentence, convert_to_tensor=True)
+        paraphrase_embeddings = self.similarity_model.encode(paraphrases, convert_to_tensor=True)
+        similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings).squeeze()
+        # Filter paraphrases by similarity
+        valid_paraphrases = [
+            para for para, sim in zip(paraphrases, similarities)
+            if sim >= min_similarity
+        ]
+        # Grammar correction for the most similar paraphrase
+        if valid_paraphrases:
+            corrected = self.grammar_pipeline(valid_paraphrases[0])[0]["generated_text"]
+            return self._humanize_text(corrected)
+        else:
+            return sentence
     def _humanize_text(self, text):
+        # Introduce minor variations to mimic human-written text
+        import random
+        contractions = {"can't": "cannot", "won't": "will not", "it's": "it is"}
         words = text.split()
         text = " ".join([contractions.get(word, word) if random.random() > 0.9 else word for word in words])
         if random.random() > 0.7:
             text = text.replace(" and ", ", and ")
         return text
     def process_text(text, similarity_threshold):
         try:
+            return enhancer.enhance_text(text, min_similarity=similarity_threshold / 100)
         except Exception as e:
             return f"Error: {str(e)}"
         ],
         outputs=gr.Textbox(label="Enhanced Text", lines=10),
         title="Text Enhancement System",
+        description="Improve text quality while preserving original meaning.",
     )
     return interface