Jaane committed on
Commit
4272847
·
verified ·
1 Parent(s): dfeb6d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -102
app.py CHANGED
@@ -2,26 +2,18 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
4
  from sentence_transformers import SentenceTransformer, util
5
- import openai
6
- import random
7
- import re
8
  import requests
9
  import warnings
10
- from transformers import logging
11
  import os
12
- import tensorflow as tf
13
 
14
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
15
- os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # 0 = all messages, 1 = INFO, 2 = WARNING, 3 = ERROR
16
- tf.get_logger().setLevel('ERROR')
17
-
18
- # Suppress Python warnings
19
  warnings.filterwarnings("ignore", category=FutureWarning) # Suppress FutureWarnings
20
  warnings.filterwarnings("ignore", category=UserWarning) # Suppress UserWarnings
21
- warnings.filterwarnings("ignore") # Suppress all warnings (optional)
22
 
23
- # Suppress Hugging Face Transformers warnings
24
- logging.set_verbosity_error()
25
  # Groq-hosted Llama 3 sentence segmentation function
26
  def segment_into_sentences_groq(passage):
27
  headers = {
@@ -32,128 +24,113 @@ def segment_into_sentences_groq(passage):
32
  "model": "llama3-8b-8192",
33
  "messages": [
34
  {
35
- "role": "system",
36
- "content": "you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences only return the modified passage and nothing else do not add your responses"
37
- },
38
  {
39
  "role": "user",
40
- "content": f"you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences only return the modified passage and nothing else do not add your responses. here is the passage:{passage}"
41
  }
42
  ],
43
- "temperature": 1.0,
44
- "max_tokens": 8192
45
  }
46
- print("response sent")
47
  response = requests.post("https://api.groq.com/openai/v1/chat/completions", json=payload, headers=headers)
48
- print("response recieved")
49
  if response.status_code == 200:
50
- data = response.json()
51
  try:
52
- segmented_text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
53
- print("SOP segmented")
54
- # Split sentences based on the custom token
55
  sentences = segmented_text.split("1!2@3#")
56
  return [sentence.strip() for sentence in sentences if sentence.strip()]
57
- except (IndexError, KeyError):
58
  raise ValueError("Unexpected response structure from Groq API.")
59
  else:
60
  raise ValueError(f"Groq API error: {response.text}")
61
 
62
 
63
-
64
  class TextEnhancer:
65
  def __init__(self):
66
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
67
 
68
- # Initialize paraphrase model
 
 
 
69
  self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
70
  self.paraphrase_model = T5ForConditionalGeneration.from_pretrained("prithivida/parrot_paraphraser_on_T5").to(self.device)
71
- print("paraphraser loaded")
72
- # Initialize grammar correction
73
  self.grammar_pipeline = pipeline(
74
  "text2text-generation",
75
  model="Grammarly/coedit-large",
76
  device=0 if self.device == "cuda" else -1
77
  )
78
- print("grammar check loaded")
79
- # Initialize semantic similarity model
80
  self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
81
- print("sementics model loaded")
82
 
83
- def enhance_text(self, text, min_similarity=0.8, max_variations=3):
84
- # Use GPT for sentence segmentation
85
  sentences = segment_into_sentences_groq(text)
86
 
87
- enhanced_sentences = []
 
88
 
89
- for sentence in sentences:
90
- if not sentence.strip():
91
- continue
92
-
93
- inputs = self.paraphrase_tokenizer(
94
- f"paraphrase: {sentence}",
95
- return_tensors="pt",
96
- padding=True,
97
- max_length=150,
98
- truncation=True
99
- ).to(self.device)
 
 
 
 
 
100
 
101
- outputs = self.paraphrase_model.generate(
102
- **inputs,
103
- max_length=len(sentence.split()) + 20,
104
- num_return_sequences=max_variations,
105
- num_beams=max_variations,
106
- temperature=0.7
107
- )
108
 
109
-
110
- paraphrases = [
111
- self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
112
- for output in outputs
113
- ]
114
-
115
- sentence_embedding = self.similarity_model.encode(sentence)
116
- paraphrase_embeddings = self.similarity_model.encode(paraphrases)
117
- similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings)
118
-
119
- valid_paraphrases = [
120
- para for para, sim in zip(paraphrases, similarities[0])
121
- if sim >= min_similarity
122
- ]
123
-
124
- if valid_paraphrases:
125
- corrected = self.grammar_pipeline(
126
- valid_paraphrases[0],
127
- max_length=150,
128
- num_return_sequences=1
129
- )[0]["generated_text"]
130
-
131
- corrected = self._humanize_text(corrected)
132
- enhanced_sentences.append(corrected)
133
- else:
134
- enhanced_sentences.append(sentence)
135
- print(sentence)
136
 
137
- enhanced_text = ". ".join(sentence.rstrip(".") for sentence in enhanced_sentences) + "."
138
- return enhanced_text
 
 
 
 
 
 
 
 
 
 
139
 
140
  def _humanize_text(self, text):
141
- """
142
- Introduce small variations to make text appear more 'human-like'
143
- """
144
- # Randomly replace contractions in some sentences
145
- contractions = {"can't": "cannot", "won't": "will not", "I'm": "I am", "it's": "it is"}
146
  words = text.split()
147
  text = " ".join([contractions.get(word, word) if random.random() > 0.9 else word for word in words])
148
 
149
- # Add optional comma variations for natural breaks
150
  if random.random() > 0.7:
151
  text = text.replace(" and ", ", and ")
152
-
153
- # Minor variations in sentence structure
154
- if random.random() > 0.5:
155
- text = text.replace(" is ", " happens to be ")
156
-
157
  return text
158
 
159
 
@@ -162,12 +139,7 @@ def create_interface():
162
 
163
  def process_text(text, similarity_threshold):
164
  try:
165
- enhanced = enhancer.enhance_text(
166
- text,
167
- min_similarity=similarity_threshold / 100
168
- )
169
- print("grammar enhanced")
170
- return enhanced
171
  except Exception as e:
172
  return f"Error: {str(e)}"
173
 
@@ -188,9 +160,8 @@ def create_interface():
188
  ],
189
  outputs=gr.Textbox(label="Enhanced Text", lines=10),
190
  title="Text Enhancement System",
191
- description="Improve text quality while preserving original meaning"
192
  )
193
-
194
  return interface
195
 
196
 
 
2
  import torch
3
  from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline
4
  from sentence_transformers import SentenceTransformer, util
 
 
 
5
  import requests
6
  import warnings
 
7
  import os
8
+ from concurrent.futures import ThreadPoolExecutor
9
 
10
+ # Set environment variables and suppress warnings
11
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Reduce TensorFlow verbosity
 
 
 
12
  warnings.filterwarnings("ignore", category=FutureWarning) # Suppress FutureWarnings
13
  warnings.filterwarnings("ignore", category=UserWarning) # Suppress UserWarnings
 
14
 
15
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
16
+
17
  # Groq-hosted Llama 3 sentence segmentation function
18
  def segment_into_sentences_groq(passage):
19
  headers = {
 
24
  "model": "llama3-8b-8192",
25
  "messages": [
26
  {
27
+ "role": "system",
28
+ "content": "you are to segment the sentence by adding '1!2@3#' at the end of each sentence. Return only the segmented sentences, nothing else."
29
+ },
30
  {
31
  "role": "user",
32
+ "content": f"Segment this passage into sentences with '1!2@3#' as a delimiter: {passage}"
33
  }
34
  ],
35
+ "temperature": 0.7,
36
+ "max_tokens": 1024
37
  }
38
+
39
  response = requests.post("https://api.groq.com/openai/v1/chat/completions", json=payload, headers=headers)
 
40
  if response.status_code == 200:
 
41
  try:
42
+ segmented_text = response.json()["choices"][0]["message"]["content"]
 
 
43
  sentences = segmented_text.split("1!2@3#")
44
  return [sentence.strip() for sentence in sentences if sentence.strip()]
45
+ except (KeyError, IndexError):
46
  raise ValueError("Unexpected response structure from Groq API.")
47
  else:
48
  raise ValueError(f"Groq API error: {response.text}")
49
 
50
 
 
51
  class TextEnhancer:
52
  def __init__(self):
53
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
54
+ self.executor = ThreadPoolExecutor(max_workers=3) # Parallel processing pool
55
 
56
+ # Load models
57
+ self._load_models()
58
+
59
+ def _load_models(self):
60
  self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5")
61
  self.paraphrase_model = T5ForConditionalGeneration.from_pretrained("prithivida/parrot_paraphraser_on_T5").to(self.device)
62
+
 
63
  self.grammar_pipeline = pipeline(
64
  "text2text-generation",
65
  model="Grammarly/coedit-large",
66
  device=0 if self.device == "cuda" else -1
67
  )
68
+
 
69
  self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device)
 
70
 
71
+ def enhance_text(self, text, min_similarity=0.8):
 
72
  sentences = segment_into_sentences_groq(text)
73
 
74
+ # Process sentences in parallel
75
+ results = list(self.executor.map(lambda s: self._process_sentence(s, min_similarity), sentences))
76
 
77
+ # Join enhanced sentences into a single text
78
+ enhanced_text = ". ".join(results).strip() + "."
79
+ return enhanced_text
80
+
81
+ def _process_sentence(self, sentence, min_similarity):
82
+ if not sentence.strip():
83
+ return sentence
84
+
85
+ # Generate paraphrases
86
+ inputs = self.paraphrase_tokenizer(
87
+ f"paraphrase: {sentence}",
88
+ return_tensors="pt",
89
+ padding=True,
90
+ max_length=150,
91
+ truncation=True
92
+ ).to(self.device)
93
 
94
+ outputs = self.paraphrase_model.generate(
95
+ **inputs,
96
+ max_length=len(sentence.split()) + 20,
97
+ num_return_sequences=3,
98
+ num_beams=3,
99
+ temperature=0.7
100
+ )
101
 
102
+ paraphrases = [
103
+ self.paraphrase_tokenizer.decode(output, skip_special_tokens=True)
104
+ for output in outputs
105
+ ]
106
+
107
+ # Calculate semantic similarity
108
+ sentence_embedding = self.similarity_model.encode(sentence, convert_to_tensor=True)
109
+ paraphrase_embeddings = self.similarity_model.encode(paraphrases, convert_to_tensor=True)
110
+ similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings).squeeze()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
+ # Filter paraphrases by similarity
113
+ valid_paraphrases = [
114
+ para for para, sim in zip(paraphrases, similarities)
115
+ if sim >= min_similarity
116
+ ]
117
+
118
+ # Grammar correction for the most similar paraphrase
119
+ if valid_paraphrases:
120
+ corrected = self.grammar_pipeline(valid_paraphrases[0])[0]["generated_text"]
121
+ return self._humanize_text(corrected)
122
+ else:
123
+ return sentence
124
 
125
  def _humanize_text(self, text):
126
+ # Introduce minor variations to mimic human-written text
127
+ import random
128
+ contractions = {"can't": "cannot", "won't": "will not", "it's": "it is"}
 
 
129
  words = text.split()
130
  text = " ".join([contractions.get(word, word) if random.random() > 0.9 else word for word in words])
131
 
 
132
  if random.random() > 0.7:
133
  text = text.replace(" and ", ", and ")
 
 
 
 
 
134
  return text
135
 
136
 
 
139
 
140
  def process_text(text, similarity_threshold):
141
  try:
142
+ return enhancer.enhance_text(text, min_similarity=similarity_threshold / 100)
 
 
 
 
 
143
  except Exception as e:
144
  return f"Error: {str(e)}"
145
 
 
160
  ],
161
  outputs=gr.Textbox(label="Enhanced Text", lines=10),
162
  title="Text Enhancement System",
163
+ description="Improve text quality while preserving original meaning.",
164
  )
 
165
  return interface
166
 
167