Spaces:

EdysorEdutech
/

human_final

Paused

App Files Files Community

EdysorEdutech committed on Jul 25

Commit

36a50be

verified ·

1 Parent(s): 479bdbb

Create app.py

Browse files

Files changed (1) hide show

app.py +1994 -0

app.py ADDED Viewed

	@@ -0,0 +1,1994 @@

+import gradio as gr
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
+from bs4 import BeautifulSoup, NavigableString, Tag
+import re
+import time
+import random
+import nltk
+from nltk.tokenize import sent_tokenize
+# Download required NLTK data
+try:
+    nltk.download('punkt', quiet=True)
+except:
+    pass
+# Try to import spaCy but make it optional
+try:
+    import spacy
+    SPACY_AVAILABLE = True
+except:
+    print("spaCy not available, using NLTK for sentence processing")
+    SPACY_AVAILABLE = False
+class HumanLikeVariations:
+    """Add human-like variations and intentional imperfections"""
+    def __init__(self):
+        # Common human writing patterns - EXPANDED for Originality AI
+        self.casual_transitions = [
+             "So, ", "Well, ", "Now, ", "Actually, ", "Basically, ",
+             "You know, ", "I mean, ", "Thing is, ", "Honestly, ",
+             "Look, ", "Listen, ", "See, ", "Okay, ", "Right, ",
+             "Anyway, ", "Besides, ", "Plus, ", "Also, ", "Oh, ",
+             "Hey, ", "Alright, ", "Sure, ", "Fine, ", "Obviously, ",
+             "Clearly, ", "Seriously, ", "Literally, ", "Frankly, ",
+             "To be honest, ", "Truth is, ", "In fact, ", "Believe it or not, ",
+             "Here's the thing, ", "Let me tell you, ", "Get this, ",
+             "Funny thing is, ", "Interestingly, ", "Surprisingly, ",
+             "Let's be real here, ", "Can we talk about ", "Quick question: ",
+             "Real talk: ", "Hot take: ", "Unpopular opinion: ", "Fun fact: ",
+             "Pro tip: ", "Side note: ", "Random thought: ", "Food for thought: ",
+             "Just saying, ", "Not gonna lie, ", "For what it's worth, ",
+             "If you ask me, ", "Between you and me, ", "Here's my take: ",
+             "Let's face it, ", "No kidding, ", "Seriously though, ",
+             "But wait, ", "Hold on, ", "Check this out: ", "Guess what? "
+        ]
+        self.filler_phrases = [
+            "kind of", "sort of", "pretty much", "basically", "actually",
+            "really", "just", "quite", "rather", "fairly", "totally",
+            "definitely", "probably", "maybe", "perhaps", "somehow",
+            "somewhat", "literally", "seriously", "honestly", "frankly",
+            "simply", "merely", "purely", "truly", "genuinely",
+            "absolutely", "completely", "entirely", "utterly", "practically",
+            "virtually", "essentially", "fundamentally", "generally", "typically",
+            "usually", "normally", "often", "sometimes", "occasionally",
+            "apparently", "evidently", "obviously", "clearly", "seemingly",
+            "arguably", "potentially", "possibly", "likely", "unlikely",
+            "more or less", "give or take", "so to speak", "if you will",
+            "per se", "as such", "in a way", "to some extent", "to a degree",
+            "I kid you not", "no joke", "for real", "not gonna lie",
+            "I'm telling you", "trust me", "believe me", "I swear",
+            "hands down", "without a doubt", "100%", "straight up",
+            "I think", "I feel like", "I guess", "I suppose", "seems like",
+            "appears to be", "might be", "could be", "tends to", "tends to be",
+            "in my experience", "from what I've seen", "as far as I know",
+            "to the best of my knowledge", "if I'm not mistaken", "correct me if I'm wrong",
+            "you know what", "here's the deal", "bottom line", "at any rate",
+            "all in all", "when you think about it", "come to think of it",
+            "now that I think about it", "if we're being honest", "to be fair"
+        ]
+        self.human_connectors = [
+            ", which means", ", so", ", because", ", since", ", although",
+            ". That's why", ". This means", ". So basically,", ". The thing is,",
+            ", and honestly", ", but here's the thing", ", though", ", however",
+            ". Plus,", ". Also,", ". Besides,", ". Moreover,", ". Furthermore,",
+            ", which is why", ", and that's because", ", given that", ", considering",
+            ". In other words,", ". Put simply,", ". To clarify,", ". That said,",
+            ", you see", ", you know", ", right?", ", okay?", ", yeah?",
+            ". Here's why:", ". Let me explain:", ". Think about it:",
+            ", if you ask me", ", in my opinion", ", from my perspective",
+            ". On the flip side,", ". On the other hand,", ". Conversely,",
+            ", not to mention", ", let alone", ", much less", ", aside from",
+            ". What's more,", ". Even better,", ". Even worse,", ". The catch is,",
+            ", believe it or not", ", surprisingly enough", ", interestingly enough",
+            ". Long story short,", ". Bottom line is,", ". Point being,",
+            ", as you might expect", ", as it turns out", ", as luck would have it",
+            ". And get this:", ". But wait, there's more:", ". Here's the kicker:",
+            ", and here's why", ", and here's the thing", ", but here's what happened",
+            ". Spoiler alert:", ". Plot twist:", ". Reality check:",
+            ", at the end of the day", ", when all is said and done", ", all things considered",
+            ". Make no mistake,", ". Don't get me wrong,", ". Let's not forget,",
+            ", between you and me", ", off the record", ", just between us",
+            ". And honestly?", ". But seriously,", ". And you know what?",
+            ", which brings me to", ". This reminds me of", ", speaking of which",
+            ". Funny enough,", ". Weird thing is,", ". Strange but true:",
+            ", and I mean", ". I'm not kidding when I say", ", and trust me on this"
+        ]
+        # NEW: Common human typos and variations
+        self.common_typos = {
+            "the": ["teh", "th", "hte"],
+            "and": ["adn", "nad", "an"],
+            "that": ["taht", "htat", "tha"],
+            "with": ["wiht", "wtih", "iwth"],
+            "have": ["ahve", "hvae", "hav"],
+            "from": ["form", "fro", "frmo"],
+            "they": ["tehy", "thye", "htey"],
+            "which": ["whihc", "wich", "whcih"],
+            "their": ["thier", "theri", "tehir"],
+            "would": ["woudl", "wuold", "woul"],
+            "there": ["tehre", "theer", "ther"],
+            "could": ["coudl", "cuold", "coud"],
+            "people": ["poeple", "peopel", "pepole"],
+            "through": ["thorugh", "throught", "trhough"],
+            "because": ["becuase", "becasue", "beacuse"],
+            "before": ["beofre", "befroe", "befor"],
+            "different": ["differnt", "differnet", "diferent"],
+            "between": ["bewteen", "betwen", "betewen"],
+            "important": ["improtant", "importnat", "importan"],
+            "information": ["infromation", "informaiton", "informaton"]
+        }
+        # NEW: Human-like sentence starters for variety
+        self.varied_starters = [
+            "When it comes to", "As for", "Regarding", "In terms of",
+            "With respect to", "Concerning", "Speaking of", "About",
+            "If we look at", "Looking at", "Considering", "Given",
+            "Taking into account", "Bear in mind that", "Keep in mind",
+            "It's worth noting that", "It should be noted that",
+            "One thing to consider is", "An important point is",
+            "What's interesting is", "What stands out is",
+            "The key here is", "The main thing is", "The point is",
+            "Here's what matters:", "Here's the deal:", "Here's something:",
+            "Let's not forget", "We should remember", "Don't forget",
+            "Think about it this way:", "Look at it like this:",
+            "Consider this:", "Picture this:", "Imagine this:",
+            "You might wonder", "You might ask", "You may think",
+            "Some people say", "Many believe", "It's often said",
+            "Research shows", "Studies indicate", "Evidence suggests",
+            "Experience tells us", "History shows", "Time has shown"
+        ]
+    def add_human_touch(self, text):
+        """Add subtle human-like imperfections - MORE AGGRESSIVE"""
+        sentences = text.split('. ')
+        modified_sentences = []
+        for i, sent in enumerate(sentences):
+            if not sent.strip():
+                continue
+            # Occasionally start with casual transition (25% chance - increased)
+            if i > 0 and random.random() < 0.25 and len(sent.split()) > 5:
+                transition = random.choice(self.casual_transitions)
+                sent = transition + sent[0].lower() + sent[1:] if len(sent) > 1 else sent
+            # Add filler words occasionally (20% chance - increased)
+            if random.random() < 0.2 and len(sent.split()) > 8:
+                words = sent.split()
+                # Add multiple fillers sometimes
+                num_fillers = random.randint(1, 2)
+                for _ in range(num_fillers):
+                    if len(words) > 4:
+                        insert_pos = random.randint(2, len(words)-2)
+                        filler = random.choice(self.filler_phrases)
+                        words.insert(insert_pos, filler)
+                sent = ' '.join(words)
+            # Add varied sentence starters (15% chance)
+            if i > 0 and random.random() < 0.15 and len(sent.split()) > 10:
+                starter = random.choice(self.varied_starters)
+                sent = starter + " " + sent[0].lower() + sent[1:] if len(sent) > 1 else sent
+            # Occasionally use contractions (35% chance - increased)
+            if random.random() < 0.35:
+                sent = self.apply_contractions(sent)
+            # Add occasional comma splices (10% chance) - common human error
+            if random.random() < 0.1 and ',' in sent and len(sent.split()) > 10:
+                # Replace a period with comma sometimes
+                parts = sent.split(', ')
+                if len(parts) > 2:
+                    join_idx = random.randint(1, len(parts)-1)
+                    parts[join_idx-1] = parts[join_idx-1] + ','
+                    sent = ' '.join(parts)
+            # NEW: Add parenthetical thoughts (8% chance)
+            if random.random() < 0.08 and len(sent.split()) > 15:
+                parentheticals = [
+                    "(and that's saying something)",
+                    "(which is pretty interesting)",
+                    "(trust me on this one)",
+                    "(I've seen this firsthand)",
+                    "(no joke)",
+                    "(seriously)",
+                    "(and for good reason)",
+                    "(believe it or not)",
+                    "(surprisingly enough)",
+                    "(which makes sense)",
+                    "(go figure)",
+                    "(who knew?)",
+                    "(makes you think)",
+                    "(worth considering)"
+                ]
+                words = sent.split()
+                insert_pos = random.randint(len(words)//3, 2*len(words)//3)
+                parenthetical = random.choice(parentheticals)
+                words.insert(insert_pos, parenthetical)
+                sent = ' '.join(words)
+            # NEW: Occasionally add rhetorical questions (5% chance)
+            if random.random() < 0.05 and i < len(sentences) - 1:
+                rhetorical_questions = [
+                    "Makes sense, right?",
+                    "Pretty cool, huh?",
+                    "Interesting, isn't it?",
+                    "Who would've thought?",
+                    "Sound familiar?",
+                    "See what I mean?",
+                    "Get the picture?",
+                    "Following along?",
+                    "Crazy, right?",
+                    "Wild, isn't it?"
+                ]
+                sent = sent + " " + random.choice(rhetorical_questions)
+            modified_sentences.append(sent)
+        return '. '.join(modified_sentences)
+    def apply_contractions(self, text):
+        """Apply common contractions - EXPANDED"""
+        contractions = {
+            "it is": "it's", "that is": "that's", "there is": "there's",
+            "he is": "he's", "she is": "she's", "what is": "what's",
+            "where is": "where's", "who is": "who's", "how is": "how's",
+            "cannot": "can't", "will not": "won't", "do not": "don't",
+            "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
+            "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
+            "are not": "aren't", "was not": "wasn't", "were not": "weren't",
+            "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
+            "I am": "I'm", "you are": "you're", "we are": "we're",
+            "they are": "they're", "I have": "I've", "you have": "you've",
+            "we have": "we've", "they have": "they've", "I will": "I'll",
+            "you will": "you'll", "he will": "he'll", "she will": "she'll",
+            "we will": "we'll", "they will": "they'll", "I would": "I'd",
+            "you would": "you'd", "he would": "he'd", "she would": "she'd",
+            "we would": "we'd", "they would": "they'd", "could have": "could've",
+            "should have": "should've", "would have": "would've", "might have": "might've",
+            "must have": "must've", "there has": "there's", "here is": "here's",
+            "let us": "let's", "that will": "that'll", "who will": "who'll"
+        }
+        for full, contr in contractions.items():
+            if random.random() < 0.8:  # 80% chance to apply each contraction
+                text = re.sub(r'\b' + full + r'\b', contr, text, flags=re.IGNORECASE)
+        return text
+    def add_minor_errors(self, text):
+        """Add very minor, human-like errors - MORE REALISTIC"""
+        # Occasionally miss Oxford comma (15% chance)
+        if random.random() < 0.15:
+            text = re.sub(r'(\w+), (\w+), and', r'\1, \2 and', text)
+        # Sometimes use 'which' instead of 'that' (8% chance)
+        if random.random() < 0.08:
+            text = text.replace(' that ', ' which ', 1)
+        # NEW: Add very occasional typos (3% chance per sentence)
+        sentences = text.split('. ')
+        for i, sent in enumerate(sentences):
+            if random.random() < 0.03 and len(sent.split()) > 10:
+                words = sent.split()
+                # Pick a random word to potentially typo
+                word_idx = random.randint(0, len(words)-1)
+                word = words[word_idx].lower()
+                # Only typo common words
+                if word in self.common_typos and random.random() < 0.5:
+                    typo = random.choice(self.common_typos[word])
+                    # Preserve original capitalization
+                    if words[word_idx][0].isupper():
+                        typo = typo[0].upper() + typo[1:]
+                    words[word_idx] = typo
+                    sentences[i] = ' '.join(words)
+        text = '. '.join(sentences)
+        # NEW: Occasionally double a word (2% chance)
+        if random.random() < 0.02:
+            words = text.split()
+            if len(words) > 20:
+                # Pick a small common word to double
+                small_words = ['the', 'a', 'an', 'is', 'was', 'are', 'were', 'to', 'of', 'in', 'on']
+                for idx, word in enumerate(words):
+                    if word.lower() in small_words and random.random() < 0.1:
+                        words[idx] = word + ' ' + word
+                        break
+                text = ' '.join(words)
+        # NEW: Mix up common homophones occasionally (3% chance)
+        if random.random() < 0.03:
+            homophones = [
+                ('their', 'there'), ('your', 'you\'re'), ('its', 'it\'s'),
+                ('then', 'than'), ('to', 'too'), ('effect', 'affect')
+            ]
+            for pair in homophones:
+                if pair[0] in text and random.random() < 0.3:
+                    text = text.replace(pair[0], pair[1], 1)
+                    break
+        return text
+    def add_originality_specific_patterns(self, text):
+        """Add patterns that Originality AI associates with human writing"""
+        # 1. Add personal touches and opinions
+        if random.random() < 0.1:
+            personal_phrases = [
+                "In my view, ", "From my perspective, ", "I believe ",
+                "It seems to me that ", "I've found that ", "In my experience, ",
+                "I tend to think ", "My take is that ", "I'd argue that ",
+                "Personally, I think ", "If you ask me, ", "The way I see it, "
+            ]
+            sentences = text.split('. ')
+            if len(sentences) > 3:
+                idx = random.randint(1, len(sentences)-2)
+                sentences[idx] = random.choice(personal_phrases) + sentences[idx][0].lower() + sentences[idx][1:]
+                text = '. '.join(sentences)
+        # 2. Add conversational asides
+        if random.random() < 0.08:
+            asides = [
+                " - and this is important - ",
+                " - bear with me here - ",
+                " - stay with me - ",
+                " - and I mean this - ",
+                " - no exaggeration - ",
+                " - true story - ",
+                " - I'm serious - ",
+                " - think about it - ",
+                " - and here's why - "
+            ]
+            words = text.split()
+            if len(words) > 20:
+                pos = random.randint(10, len(words)-10)
+                words.insert(pos, random.choice(asides))
+                text = ' '.join(words)
+        # 3. Add emphatic repetition (human pattern)
+        if random.random() < 0.05:
+            emphatic_words = ['very', 'really', 'truly', 'absolutely', 'totally']
+            sentences = text.split('. ')
+            if sentences:
+                sent_idx = random.randint(0, len(sentences)-1)
+                words = sentences[sent_idx].split()
+                if len(words) > 5:
+                    # Find an adjective or adverb to emphasize
+                    for i, word in enumerate(words):
+                        if i > 0 and i < len(words)-1:
+                            # Add emphasis
+                            if random.random() < 0.3:
+                                emphasis = random.choice(emphatic_words)
+                                words.insert(i, emphasis)
+                                # Sometimes repeat for extra emphasis
+                                if random.random() < 0.3:
+                                    words.insert(i, emphasis + ',')
+                                break
+                    sentences[sent_idx] = ' '.join(words)
+                    text = '. '.join(sentences)
+        return text
+class SelectiveGrammarFixer:
+    """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""
+    def __init__(self):
+        self.nlp = None
+        self.human_variations = HumanLikeVariations()
+    def fix_incomplete_sentences_only(self, text):
+        """Fix only incomplete sentences without over-correcting"""
+        if not text:
+            return text
+        sentences = text.split('. ')
+        fixed_sentences = []
+        for i, sent in enumerate(sentences):
+            sent = sent.strip()
+            if not sent:
+                continue
+            # Only fix if sentence is incomplete
+            if sent and sent[-1] not in '.!?':
+                # Check if it's the last sentence
+                if i == len(sentences) - 1:
+                    # Add period if it's clearly a statement
+                    if not sent.endswith(':') and not sent.endswith(','):
+                        sent += '.'
+                else:
+                    # Middle sentences should have periods
+                    sent += '.'
+            # Fix cut-off words (very short last word without punctuation)
+            words = sent.split()
+            if len(words) > 3:
+                last_word = words[-1].rstrip('.!?')
+                if len(last_word) <= 2 and last_word.isalpha():
+                    # Check if it has vowels (real word vs cut-off)
+                    if not any(c in 'aeiouAEIOU' for c in last_word):
+                        # Likely a cut-off word, remove it
+                        words = words[:-1]
+                        sent = ' '.join(words)
+                        if sent and sent[-1] not in '.!?':
+                            sent += '.'
+            # Ensure first letter capitalization ONLY after sentence endings
+            if i > 0 and sent and sent[0].islower():
+                # Check if previous sentence ended with punctuation
+                if fixed_sentences and fixed_sentences[-1].rstrip().endswith(('.', '!', '?')):
+                    sent = sent[0].upper() + sent[1:]
+            elif i == 0 and sent and sent[0].islower():
+                # First sentence should be capitalized
+                sent = sent[0].upper() + sent[1:]
+            fixed_sentences.append(sent)
+        result = ' '.join(fixed_sentences)
+        # Add human-like variations
+        result = self.human_variations.add_human_touch(result)
+        result = self.human_variations.add_minor_errors(result)
+        result = self.human_variations.add_originality_specific_patterns(result)
+        return result
+    def fix_basic_punctuation_errors(self, text):
+        """Fix only the most egregious punctuation errors"""
+        if not text:
+            return text
+        # Fix double spaces (human-like error)
+        text = re.sub(r'\s{2,}', ' ', text)
+        # Fix space before punctuation (common error)
+        text = re.sub(r'\s+([.,!?;:])', r'\1', text)
+        # Fix missing space after punctuation (human-like)
+        text = re.sub(r'([.,!?])([A-Z])', r'\1 \2', text)
+        # Fix accidental double punctuation
+        text = re.sub(r'([.!?])\1+', r'\1', text)
+        # Fix "i" capitalization (common human error to fix)
+        text = re.sub(r'\bi\b', 'I', text)
+        return text
+    def preserve_natural_variations(self, text):
+        """Keep some natural human-like variations"""
+        # Don't fix everything - leave some variety
+        # Only fix if really broken
+        if text.count('.') == 0 and len(text.split()) > 20:
+            # Long text with no periods - needs fixing
+            words = text.split()
+            # Add periods every 15-25 words naturally (more variation)
+            new_text = []
+            for i, word in enumerate(words):
+                new_text.append(word)
+                if i > 0 and i % random.randint(12, 25) == 0:
+                    if word[-1] not in '.!?,;:':
+                        new_text[-1] = word + '.'
+                        # Capitalize next word if it's not an acronym
+                        if i + 1 < len(words) and words[i + 1][0].islower():
+                            # Check if it's not likely an acronym
+                            if not words[i + 1].isupper():
+                                words[i + 1] = words[i + 1][0].upper() + words[i + 1][1:]
+            text = ' '.join(new_text)
+        return text
+    def smart_fix(self, text):
+        """Apply minimal fixes to maintain human-like quality"""
+        # Apply fixes in order of importance
+        text = self.fix_basic_punctuation_errors(text)
+        text = self.fix_incomplete_sentences_only(text)
+        text = self.preserve_natural_variations(text)
+        return text
+class EnhancedDipperHumanizer:
+    def __init__(self):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        print(f"Using device: {self.device}")
+        # Clear GPU cache
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        # Initialize grammar fixer
+        self.grammar_fixer = SelectiveGrammarFixer()
+        # Try to load spaCy if available
+        self.nlp = None
+        self.use_spacy = False
+        if SPACY_AVAILABLE:
+            try:
+                self.nlp = spacy.load("en_core_web_sm")
+                self.use_spacy = True
+                print("spaCy loaded successfully")
+            except:
+                print("spaCy model not found, using NLTK for sentence splitting")
+        try:
+            # Load Dipper paraphraser WITHOUT 8-bit quantization for better performance
+            print("Loading Dipper paraphraser model...")
+            self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
+            self.model = T5ForConditionalGeneration.from_pretrained(
+                "kalpeshk2011/dipper-paraphraser-xxl",
+                device_map="auto",  # This will distribute across 4xL40S automatically
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True
+            )
+            print("Dipper model loaded successfully!")
+            self.is_dipper = True
+        except Exception as e:
+            print(f"Error loading Dipper model: {str(e)}")
+            print("Falling back to Flan-T5-XL...")
+            self.is_dipper = False
+            # Fallback to Flan-T5-XL
+            try:
+                self.model = T5ForConditionalGeneration.from_pretrained(
+                    "google/flan-t5-xl",
+                    torch_dtype=torch.float16,
+                    low_cpu_mem_usage=True,
+                    device_map="auto"
+                )
+                self.tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
+                print("Loaded Flan-T5-XL as fallback")
+            except:
+                raise Exception("Could not load any model. Please check your system resources.")
+        # Load BART as secondary model
+        try:
+            print("Loading BART model for additional variation...")
+            self.bart_model = AutoModelForSeq2SeqLM.from_pretrained(
+                "eugenesiow/bart-paraphrase",
+                torch_dtype=torch.float16,
+                device_map="auto"  # Distribute across GPUs
+            )
+            self.bart_tokenizer = AutoTokenizer.from_pretrained("eugenesiow/bart-paraphrase")
+            self.use_bart = True
+            print("BART model loaded successfully")
+        except:
+            print("BART model not available")
+            self.use_bart = False
+    def preserve_keywords(self, text, keywords):
+        """Mark keywords to preserve them during paraphrasing"""
+        if not keywords:
+            return text, {}
+        # Create a mapping of placeholders to keywords
+        keyword_map = {}
+        modified_text = text
+        # Sort keywords by length (longest first) to avoid partial replacements
+        sorted_keywords = sorted(keywords, key=len, reverse=True)
+        for i, keyword in enumerate(sorted_keywords):
+            # Use unique markers that won't be confused
+            placeholder = f"__KW{i:03d}__"  # e.g., __KW001__
+            # Find all occurrences of the keyword (case-insensitive)
+            pattern = r'\b' + re.escape(keyword) + r'\b'
+            matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
+            if matches:
+                # Replace all occurrences with the placeholder
+                for match in reversed(matches):  # Reverse to maintain positions
+                    original_keyword = match.group(0)
+                    start, end = match.span()
+                    modified_text = modified_text[:start] + placeholder + modified_text[end:]
+                    # Store the original case version
+                    keyword_map[placeholder] = original_keyword
+        return modified_text, keyword_map
+    def restore_keywords_robust(self, text, keyword_map):
+        """Restore keywords with more flexible pattern matching"""
+        if not keyword_map:
+            return text
+        restored_text = text
+        # Debug: print what we're working with
+        print(f"Restoring keywords in text: {restored_text[:100]}...")
+        print(f"Keyword map: {keyword_map}")
+        # First pass: Direct placeholder replacement
+        for placeholder, keyword in keyword_map.items():
+            if placeholder in restored_text:
+                print(f"Found exact placeholder {placeholder}, replacing with {keyword}")
+                restored_text = restored_text.replace(placeholder, keyword)
+        # Second pass: Handle any mangled placeholders
+        # The model might alter placeholders in various ways
+        for placeholder, keyword in keyword_map.items():
+            # Extract the number from placeholder
+            match = re.search(r'__KW(\d+)__', placeholder)
+            if match:
+                num = match.group(1)
+                # Various patterns the model might create
+                patterns = [
+                    f'__KW{num}__',
+                    f'__ KW{num}__',
+                    f'__KW {num}__',
+                    f'__ KW {num} __',
+                    f'_KW{num}_',
+                    f'_kw{num}_',  # lowercase with single underscore
+                    f'KW{num}',
+                    f'KW {num}',
+                    f'__kw{num}__',  # lowercase variant
+                    f'__Kw{num}__',  # mixed case
+                    f'__ kw{num}__',
+                    f'__KW{num}_',   # missing underscore
+                    f'_KW{num}__',   # missing underscore
+                    f'kw{num}',      # just lowercase
+                    f'___',          # Sometimes model reduces to just underscores
+                    f'____',         # Various underscore patterns
+                    f'_____',
+                    f'__ __',
+                    f'___ ___',
+                ]
+                for pattern in patterns:
+                    if pattern in restored_text:
+                        print(f"Found pattern '{pattern}', replacing with {keyword}")
+                        restored_text = restored_text.replace(pattern, keyword)
+        # Third pass: Use regex to catch any remaining variations
+        # This catches cases where the model might have added characters
+        for placeholder, keyword in keyword_map.items():
+            match = re.search(r'__KW(\d+)__', placeholder)
+            if match:
+                num = match.group(1)
+                # Regex to match various mangled versions including single underscore
+                regex_patterns = [
+                    rf'_+\s*[Kk][Ww]\s*{num}\s*_*',  # Any underscores, case insensitive
+                    rf'[Kk][Ww]\s*{num}(?!\d)',       # KW followed by the number
+                    rf'__?\s*[Kk][Ww]\s*{num}\s*__?', # Optional underscores
+                    rf'_[Kk][Ww]{num}_',              # Single underscore version
+                    rf'_+\s*{num}\s*_*',              # Just the number with underscores
+                    rf'__+',                          # Multiple underscores (fallback)
+                ]
+                for pattern in regex_patterns:
+                    matches = list(re.finditer(pattern, restored_text, flags=re.IGNORECASE))
+                    if matches:
+                        print(f"Found regex pattern '{pattern}' {len(matches)} times")
+                        # Replace from end to beginning to maintain positions
+                        for match in reversed(matches):
+                            restored_text = restored_text[:match.start()] + keyword + restored_text[match.end():]
+        # Fourth pass: Look for common patterns where model mangles placeholders
+        # Sometimes the model turns __KW002__ into things like "___ University" or "___ College__"
+        underscore_patterns = [
+            (r'___+\s*[Uu]niversity', keyword + ' University') if 'universit' in keyword.lower() else None,
+            (r'___+\s*[Cc]ollege__?', keyword + ' College') if 'college' in keyword.lower() else None,
+            (r'___+\s*[Ss]chool', keyword + ' School') if 'school' in keyword.lower() else None,
+            (r'___+', keyword),  # Generic underscore replacement
+        ]
+        for pattern_tuple in underscore_patterns:
+            if pattern_tuple:
+                pattern, replacement = pattern_tuple
+                if re.search(pattern, restored_text):
+                    print(f"Found underscore pattern '{pattern}', replacing with {replacement}")
+                    restored_text = re.sub(pattern, replacement, restored_text)
+        # Final safety check: Look for any remaining placeholder-like patterns
+        remaining_underscores = re.findall(r'_{2,}', restored_text)
+        if remaining_underscores:
+            print(f"Warning: Found remaining underscore patterns: {remaining_underscores}")
+            # If we still have multiple underscores and we have keywords, do a simple replacement
+            # This is aggressive but necessary when model completely mangles placeholders
+            if '___' in restored_text and keyword_map:
+                # Replace the first occurrence of multiple underscores with each keyword
+                for placeholder, keyword in keyword_map.items():
+                    if '___' in restored_text:
+                        restored_text = restored_text.replace('___', keyword, 1)
+        # Log final result
+        print(f"Final restored text: {restored_text[:100]}...")
+        return restored_text
+    def should_skip_element(self, element, text):
+        """Determine if an element should be skipped from paraphrasing"""
+        if not text or len(text.strip()) < 3:
+            return True
+        # Skip JavaScript code inside script tags
+        parent = element.parent
+        if parent and parent.name in ['script', 'style', 'noscript']:
+            return True
+        # Skip headings (h1-h6)
+        if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
+            return True
+        # Skip content inside <strong> and <b> tags
+        if parent and parent.name in ['strong', 'b']:
+            return True
+        # Skip table content
+        if parent and (parent.name in ['td', 'th'] or any(p.name == 'table' for p in parent.parents)):
+            return True
+        # Special handling for content inside tables
+        # Skip if it's inside strong/b/h1-h6 tags AND also inside a table
+        if parent:
+            # Check if we're inside a table
+            is_in_table = any(p.name == 'table' for p in parent.parents)
+            if is_in_table:
+                # If we're in a table, skip any text that's inside formatting tags
+                if parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'i']:
+                    return True
+                # Also check if parent's parent is a formatting tag
+                if parent.parent and parent.parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                    return True
+        # Skip table of contents
+        if parent:
+            parent_text = str(parent).lower()
+            if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
+                return True
+        # Skip CTAs and buttons
+        if parent and parent.name in ['button', 'a']:
+            return True
+        # Skip if parent has onclick or other event handlers
+        if parent and parent.attrs:
+            event_handlers = ['onclick', 'onchange', 'onsubmit', 'onload', 'onmouseover', 'onmouseout']
+            if any(handler in parent.attrs for handler in event_handlers):
+                return True
+        # Special check for testimonial cards - check up to 3 levels of ancestors
+        if parent:
+            ancestors_to_check = []
+            current = parent
+            for _ in range(3):  # Check up to 3 levels up
+                if current:
+                    ancestors_to_check.append(current)
+                    current = current.parent
+            # Check if any ancestor has testimonial-card class
+            for ancestor in ancestors_to_check:
+                if ancestor and ancestor.get('class'):
+                    classes = ancestor.get('class', [])
+                    if isinstance(classes, list):
+                        if any('testimonial-card' in str(cls) for cls in classes):
+                            return True
+                    elif isinstance(classes, str) and 'testimonial-card' in classes:
+                        return True
+        # Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
+        skip_indicators = [
+            'cta-', 'button', 'btn', 'heading', 'title', 'caption',
+            'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
+            'warning', 'info', 'success', 'error', 'code', 'pre',
+            'stats-grid', 'testimonial-card', 'highlight-box',
+            'cta-box', 'quiz-container', 'news-box', 'contact-form',
+            'faq-question', 'sidebar', 'widget', 'banner', 'news-section',
+            'author-intro', 'testimonial', 'review', 'feedback',
+            'floating-', 'stat-', 'progress-', 'option', 'results',
+            'question-container', 'quiz-', 'faq-',
+            'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
+        ]
+        # Check only immediate parent and grandparent (not all ancestors)
+        elements_to_check = [parent]
+        if parent and parent.parent:
+            elements_to_check.append(parent.parent)
+        for elem in elements_to_check:
+            if not elem:
+                continue
+            # Check element's class
+            elem_class = elem.get('class', [])
+            if isinstance(elem_class, list):
+                class_str = ' '.join(str(cls).lower() for cls in elem_class)
+                if any(indicator in class_str for indicator in skip_indicators):
+                    return True
+            # Check element's ID
+            elem_id = elem.get('id', '')
+            if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
+                return True
+        # Skip short phrases that might be UI elements
+        word_count = len(text.split())
+        if word_count <= 5:
+            ui_patterns = [
+                'click', 'download', 'learn more', 'read more', 'sign up',
+                'get started', 'try now', 'buy now', 'next', 'previous',
+                'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
+                'check out:', 'see also:', 'related:', 'question', 'of'
+            ]
+            if any(pattern in text.lower() for pattern in ui_patterns):
+                return True
+        # Skip very short content in styled containers
+        if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
+            style = parent.get('style', '')
+            if 'border' in style or 'background' in style:
+                if word_count <= 20:
+                    # But don't skip if it's inside a paragraph
+                    if not any(p.name == 'p' for p in parent.parents):
+                        return True
+        return False
+    def is_likely_acronym_or_proper_noun(self, word):
+        """Check if a word is likely an acronym or part of a proper noun"""
+        # Common acronyms and abbreviations
+        acronyms = {'MBA', 'CEO', 'USA', 'UK', 'GMAT', 'GRE', 'SAT', 'ACT', 'PhD', 'MD', 'IT', 'AI', 'ML'}
+        # Check if it's in our acronym list
+        if word.upper() in acronyms:
+            return True
+        # Check if it's all caps (likely acronym)
+        if word.isupper() and len(word) > 1:
+            return True
+        # Check if it follows patterns like "Edition", "Focus", etc. that often come after proper nouns
+        proper_noun_continuations = {
+            'Edition', 'Version', 'Series', 'Focus', 'System', 'Method', 'School',
+            'University', 'College', 'Institute', 'Academy', 'Center', 'Centre'
+        }
+        if word in proper_noun_continuations:
+            return True
+        return False
+    def clean_model_output_enhanced(self, text):
+        """Enhanced cleaning that preserves more natural structure"""
+        if not text:
+            return ""
+        # Store original for fallback
+        original = text
+        # Remove ONLY clear model artifacts
+        text = re.sub(r'^lexical\s*=\s*\d+\s*,\s*order\s*=\s*\d+\s*', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'<sent>\s*', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'\s*</sent>', '', text, flags=re.IGNORECASE)
+        # Only remove clear prefixes
+        if text.lower().startswith('paraphrase:'):
+            text = text[11:].strip()
+        elif text.lower().startswith('rewrite:'):
+            text = text[8:].strip()
+        # Remove leading non-letter characters carefully
+        # IMPORTANT: Preserve keyword placeholders
+        if not re.match(r'^__KW\d+__', text):
+            # Only remove if it doesn't start with a placeholder
+            text = re.sub(r'^[^a-zA-Z_]+', '', text)
+        # If we accidentally removed too much, use original
+        if len(text) < len(original) * 0.5:
+            text = original
+        return text.strip()
+    def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20, keywords=None):
+        """Paraphrase text using Dipper model with sentence-level processing"""
+        if not text or len(text.strip()) < 3:
+            return text
+        # Preserve keywords
+        text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
+        # Add debug logging
+        if keyword_map:
+            print(f"Debug: Created keyword map: {keyword_map}")
+            print(f"Debug: Text with placeholders: {text_with_placeholders[:100]}...")
+        # Split into sentences for better control
+        sentences = self.split_into_sentences_advanced(text_with_placeholders)
+        paraphrased_sentences = []
+        for sentence in sentences:
+            if len(sentence.strip()) < 3:
+                paraphrased_sentences.append(sentence)
+                continue
+            try:
+                # Adjust diversity based on presence of keywords
+                has_keywords = any(placeholder in sentence for placeholder in keyword_map.keys())
+                if has_keywords:
+                    # Use MODERATE diversity when keywords are present to avoid mangling
+                    lex_diversity = 40  # Reduced from 70
+                    order_diversity = 10  # Reduced from 20
+                elif len(sentence.split()) < 10:
+                    lex_diversity = 70  # Reduced from 80
+                    order_diversity = 25  # Reduced from 30
+                else:
+                    lex_diversity = 85  # Slightly reduced from 90
+                    order_diversity = 35  # Slightly reduced from 40
+                lex_code = int(100 - lex_diversity)
+                order_code = int(100 - order_diversity)
+                # Format input for Dipper
+                if self.is_dipper:
+                    input_text = f"lexical = {lex_code}, order = {order_code} <sent> {sentence} </sent>"
+                else:
+                    input_text = f"paraphrase: {sentence}"
+                # Tokenize
+                inputs = self.tokenizer(
+                    input_text,
+                    return_tensors="pt",
+                    max_length=512,
+                    truncation=True,
+                    padding=True
+                )
+                # Move to device
+                if hasattr(self.model, 'device_map') and self.model.device_map:
+                    device = next(iter(self.model.device_map.values()))
+                    inputs = {k: v.to(device) for k, v in inputs.items()}
+                else:
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                # Generate with appropriate variation based on keywords
+                original_length = len(sentence.split())
+                max_new_length = int(original_length * 1.3)  # Reduced from 1.4
+                # Adjust temperature based on keywords
+                temp = 0.9 if has_keywords else 1.1  # Lower temp for keywords
+                top_p_val = 0.95 if has_keywords else 0.9
+                with torch.no_grad():
+                    outputs = self.model.generate(
+                        **inputs,
+                        max_length=max_new_length + 20,
+                        min_length=max(5, int(original_length * 0.7)),
+                        do_sample=True,
+                        top_p=top_p_val,
+                        temperature=temp,
+                        no_repeat_ngram_size=3,
+                        num_beams=3 if has_keywords else 2,  # More beams for stability with keywords
+                        early_stopping=True
+                    )
+                # Decode
+                paraphrased = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                # Clean model artifacts
+                paraphrased = self.clean_model_output_enhanced(paraphrased)
+                # Fix incomplete sentences
+                paraphrased = self.fix_incomplete_sentence_smart(paraphrased, sentence)
+                # Ensure reasonable length
+                if len(paraphrased.split()) > max_new_length:
+                    paraphrased = ' '.join(paraphrased.split()[:max_new_length])
+                paraphrased_sentences.append(paraphrased)
+            except Exception as e:
+                print(f"Error paraphrasing sentence: {str(e)}")
+                paraphrased_sentences.append(sentence)
+        # Join sentences back
+        result = ' '.join(paraphrased_sentences)
+        # Debug before restoration
+        if keyword_map:
+            print(f"Debug: Result before restoration: {result[:100]}...")
+            print(f"Debug: Checking for placeholders...")
+            for placeholder in keyword_map.keys():
+                if placeholder in result:
+                    print(f"Debug: Found placeholder {placeholder} in result")
+                else:
+                    # Check for mangled versions
+                    if '___' in result:
+                        print(f"Debug: Found underscores ___ instead of {placeholder}")
+        # Restore keywords AFTER joining all sentences
+        result = self.restore_keywords_robust(result, keyword_map)
+        # Debug after restoration
+        if keyword_map:
+            print(f"Debug: Result after restoration: {result[:100]}...")
+        # Apply minimal grammar fixes with human variations
+        result = self.grammar_fixer.smart_fix(result)
+        return result
+    def fix_incomplete_sentence_smart(self, generated, original):
+        """Smarter sentence completion that maintains natural flow"""
+        if not generated or not generated.strip():
+            return original
+        generated = generated.strip()
+        # Check if the sentence seems complete semantically
+        words = generated.split()
+        if len(words) >= 3:
+            # Check if last word is a good ending word
+            last_word = words[-1].lower().rstrip('.,!?;:')
+            # Common ending words that might not need punctuation fix
+            ending_words = {
+                'too', 'also', 'well', 'though', 'however',
+                'furthermore', 'moreover', 'indeed', 'anyway',
+                'regardless', 'nonetheless', 'therefore', 'thus'
+            }
+            # If it ends with a good word, just add appropriate punctuation
+            if last_word in ending_words:
+                if generated[-1] not in '.!?':
+                    generated += '.'
+                return generated
+        # Check for cut-off patterns
+        if len(words) > 0:
+            last_word = words[-1]
+            # Remove if it's clearly cut off (1-2 chars, no vowels)
+            # But don't remove valid short words like "is", "of", "to", etc.
+            short_valid_words = {'is', 'of', 'to', 'in', 'on', 'at', 'by', 'or', 'if', 'so', 'up', 'no', 'we', 'he', 'me', 'be', 'do', 'go'}
+            if (len(last_word) <= 2 and
+                last_word.lower() not in short_valid_words and
+                not any(c in 'aeiouAEIOU' for c in last_word)):
+                words = words[:-1]
+                generated = ' '.join(words)
+        # Add ending punctuation based on context
+        if generated and generated[-1] not in '.!?:,;':
+            # Check original ending
+            orig_stripped = original.strip()
+            if orig_stripped.endswith('?'):
+                # Check if generated seems like a question
+                question_words = ['what', 'why', 'how', 'when', 'where', 'who', 'which', 'is', 'are', 'do', 'does', 'can', 'could', 'would', 'should']
+                first_word = generated.split()[0].lower() if generated.split() else ''
+                if first_word in question_words:
+                    generated += '?'
+                else:
+                    generated += '.'
+            elif orig_stripped.endswith('!'):
+                # Check if generated seems exclamatory
+                exclaim_words = ['amazing', 'incredible', 'fantastic', 'terrible', 'awful', 'wonderful', 'excellent']
+                if any(word in generated.lower() for word in exclaim_words):
+                    generated += '!'
+                else:
+                    generated += '.'
+            elif orig_stripped.endswith(':'):
+                generated += ':'
+            else:
+                generated += '.'
+        # Ensure first letter is capitalized ONLY if it's sentence start
+        # Don't capitalize words like "iPhone" or "eBay" or placeholders
+        if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]) and not generated.startswith('__KW'):
+            generated = generated[0].upper() + generated[1:]
+        return generated
+    def split_into_sentences_advanced(self, text):
+        """Advanced sentence splitting using spaCy or NLTK"""
+        if self.use_spacy and self.nlp:
+            doc = self.nlp(text)
+            sentences = [sent.text.strip() for sent in doc.sents]
+        else:
+            # Fallback to NLTK
+            try:
+                sentences = sent_tokenize(text)
+            except:
+                # Final fallback to regex
+                sentences = re.split(r'(?<=[.!?])\s+', text)
+        # Clean up sentences
+        return [s for s in sentences if s and len(s.strip()) > 0]
+    def paraphrase_with_bart(self, text, keywords=None):
+        """Additional paraphrasing with BART for more variation"""
+        if not self.use_bart or not text or len(text.strip()) < 3:
+            return text
+        try:
+            # Preserve keywords
+            text_with_placeholders, keyword_map = self.preserve_keywords(text, keywords)
+            # Process in smaller chunks for BART
+            sentences = self.split_into_sentences_advanced(text_with_placeholders)
+            paraphrased_sentences = []
+            for sentence in sentences:
+                if len(sentence.split()) < 5:
+                    paraphrased_sentences.append(sentence)
+                    continue
+                inputs = self.bart_tokenizer(
+                    sentence,
+                    return_tensors='pt',
+                    max_length=128,
+                    truncation=True
+                )
+                # Move to appropriate device
+                if hasattr(self.bart_model, 'device_map') and self.bart_model.device_map:
+                    device = next(iter(self.bart_model.device_map.values()))
+                    inputs = {k: v.to(device) for k, v in inputs.items()}
+                else:
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+                original_length = len(sentence.split())
+                with torch.no_grad():
+                    outputs = self.bart_model.generate(
+                        **inputs,
+                        max_length=int(original_length * 1.4) + 10,
+                        min_length=max(5, int(original_length * 0.6)),
+                        num_beams=2,
+                        temperature=1.1,  # Higher temperature
+                        do_sample=True,
+                        top_p=0.9,
+                        early_stopping=True
+                    )
+                paraphrased = self.bart_tokenizer.decode(outputs[0], skip_special_tokens=True)
+                # Fix incomplete sentences
+                paraphrased = self.fix_incomplete_sentence_smart(paraphrased, sentence)
+                paraphrased_sentences.append(paraphrased)
+            result = ' '.join(paraphrased_sentences)
+            # Restore keywords AFTER joining all sentences
+            result = self.restore_keywords_robust(result, keyword_map)
+            # Apply minimal grammar fixes
+            result = self.grammar_fixer.smart_fix(result)
+            return result
+        except Exception as e:
+            print(f"Error in BART paraphrasing: {str(e)}")
+            return text
+    def apply_sentence_variation(self, text):
+        """Apply natural sentence structure variations - MORE AGGRESSIVE"""
+        sentences = self.split_into_sentences_advanced(text)
+        varied_sentences = []
+        for i, sentence in enumerate(sentences):
+            # Skip empty sentences
+            if not sentence.strip():
+                continue
+            # MORE aggressive variations
+            # Combine short sentences more often (50% chance)
+            if (i < len(sentences) - 1 and
+                len(sentence.split()) < 15 and
+                len(sentences[i+1].split()) < 15 and
+                random.random() < 0.5):
+                connectors = [', and', ', but', '; however,', '. Also,', '. Plus,', ', so', ', which means',
+                             ' - and', ' - but', '; meanwhile,', '. That said,', ', yet', ' - though']
+                connector = random.choice(connectors)
+                # Handle the next sentence properly
+                next_sent = sentences[i+1].strip()
+                if next_sent:
+                    combined = f"{sentence.rstrip('.')}{connector} {next_sent[0].lower()}{next_sent[1:]}"
+                    varied_sentences.append(combined)
+                    sentences[i+1] = ""  # Mark as processed
+            elif sentence:  # Only process non-empty sentences
+                # Split very long sentences more aggressively
+                if len(sentence.split()) > 18 and ',' in sentence:
+                    parts = sentence.split(', ', 1)
+                    if len(parts) == 2 and len(parts[1].split()) > 6:
+                        # 70% chance to split
+                        if random.random() < 0.7:
+                            varied_sentences.append(parts[0] + '.')
+                            # Ensure second part starts with capital
+                            if parts[1]:
+                                varied_sentences.append(parts[1][0].upper() + parts[1][1:])
+                        else:
+                            varied_sentences.append(sentence)
+                    else:
+                        varied_sentences.append(sentence)
+                else:
+                    # Add natural variations more often (35% chance)
+                    if i > 0 and random.random() < 0.35:
+                        # Sometimes add a transition
+                        transitions = ['Furthermore, ', 'Additionally, ', 'Moreover, ', 'Also, ',
+                                     'Besides, ', 'What\'s more, ', 'In addition, ', 'Not only that, ',
+                                     'To add to that, ', 'On top of that, ', 'Beyond that, ']
+                        transition = random.choice(transitions)
+                        if sentence[0].isupper():
+                            sentence = transition + sentence[0].lower() + sentence[1:]
+                    # Add mid-sentence interruptions (10% chance)
+                    if random.random() < 0.1 and len(sentence.split()) > 12:
+                        interruptions = [
+                            " - and this is crucial - ",
+                            " - believe me - ",
+                            " - no kidding - ",
+                            " (and yes, I mean it) ",
+                            " - stay with me here - ",
+                            " - and I'm not exaggerating - "
+                        ]
+                        words = sentence.split()
+                        pos = random.randint(len(words)//3, 2*len(words)//3)
+                        words.insert(pos, random.choice(interruptions))
+                        sentence = ' '.join(words)
+                    varied_sentences.append(sentence)
+        # Post-process for additional human patterns
+        result = ' '.join([s for s in varied_sentences if s])
+        # Add occasional fragments for human touch (5% chance)
+        if random.random() < 0.05:
+            fragments = [
+                "Crazy, I know.",
+                "Wild stuff.",
+                "Makes you think.",
+                "Pretty interesting.",
+                "Go figure.",
+                "Who knew?",
+                "There you have it.",
+                "Food for thought.",
+                "Just saying.",
+                "Worth considering."
+            ]
+            sentences = result.split('. ')
+            if len(sentences) > 3:
+                insert_pos = random.randint(1, len(sentences)-1)
+                sentences.insert(insert_pos, random.choice(fragments))
+                result = '. '.join(sentences)
+        return result
+    def fix_punctuation(self, text):
+        """Comprehensive punctuation and formatting fixes"""
+        if not text:
+            return ""
+        # First, clean any remaining model artifacts
+        text = self.clean_model_output_enhanced(text)
+        # Fix weird symbols and characters using safe replacements
+        text = text.replace('<>', '')  # Remove empty angle brackets
+        # Normalize quotes - use replace instead of regex for problematic characters
+        text = text.replace('«', '"').replace('»', '"')
+        text = text.replace('„', '"').replace('"', '"').replace('"', '"')
+        text = text.replace(''', "'").replace(''', "'")
+        text = text.replace('–', '-').replace('—', '-')
+        # Fix colon issues
+        text = re.sub(r'\.:', ':', text)  # Remove period before colon
+        text = re.sub(r':\s*\.', ':', text)  # Remove period after colon
+        # Fix basic spacing
+        text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
+        text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Remove space before punctuation
+        text = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', text)  # Remove double punctuation
+        text = re.sub(r'([.!?])\s*\1+', r'\1', text)  # Remove repeated punctuation
+        # Fix colons
+        text = re.sub(r':\s*([.,!?])', ':', text)  # Remove punctuation after colon
+        text = re.sub(r'([.,!?])\s*:', ':', text)  # Remove punctuation before colon
+        text = re.sub(r':+', ':', text)  # Multiple colons to one
+        # Fix quotes and parentheses
+        text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
+        text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)
+        text = re.sub(r'\(\s*([^)]*?)\s*\)', r'(\1)', text)
+        # Fix sentence capitalization more carefully
+        # Split on ACTUAL sentence endings only
+        sentences = re.split(r'(?<=[.!?])\s+', text)
+        fixed_sentences = []
+        for i, sentence in enumerate(sentences):
+            if not sentence:
+                continue
+            # Only capitalize the first letter if it's actually lowercase
+            # and not part of a special case (like iPhone, eBay, etc.)
+            words = sentence.split()
+            if words:
+                first_word = words[0]
+                # Check if it's not an acronym or proper noun that should stay lowercase
+                if (first_word[0].islower() and
+                    not self.is_likely_acronym_or_proper_noun(first_word) and
+                    not first_word.startswith('__KW') and
+                    not first_word.startswith('_kw')):
+                    # Only capitalize if it's a regular word
+                    sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
+            fixed_sentences.append(sentence)
+        text = ' '.join(fixed_sentences)
+        # Fix common issues
+        text = re.sub(r'\bi\b', 'I', text)  # Capitalize 'I'
+        text = re.sub(r'\.{2,}', '.', text)  # Multiple periods to one
+        text = re.sub(r',{2,}', ',', text)  # Multiple commas to one
+        text = re.sub(r'\s*,\s*,\s*', ', ', text)  # Double commas with spaces
+        # Remove weird artifacts
+        text = re.sub(r'\b(CHAPTER\s+[IVX]+|SECTION\s+\d+)\b[^\w]*', '', text, flags=re.IGNORECASE)
+        # Fix abbreviations
+        text = re.sub(r'\betc\s*\.\s*\.', 'etc.', text)
+        text = re.sub(r'\be\.g\s*\.\s*[,\s]', 'e.g., ', text)
+        text = re.sub(r'\bi\.e\s*\.\s*[,\s]', 'i.e., ', text)
+        # Fix numbers with periods (like "1. " at start of lists)
+        text = re.sub(r'(\d+)\.\s+', r'\1. ', text)
+        # Fix bold/strong tags punctuation
+        text = self.fix_bold_punctuation(text)
+        # Clean up any remaining issues
+        text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Final space cleanup
+        text = re.sub(r'([.,!?;:])\s{2,}', r'\1 ', text)  # Fix multiple spaces after punctuation
+        # Ensure ending punctuation
+        text = text.strip()
+        if text and text[-1] not in '.!?':
+            # Don't add period if it ends with colon (likely a list header)
+            if not text.endswith(':'):
+                text += '.'
+        return text
+    def fix_bold_punctuation(self, text):
+        """Fix punctuation issues around bold/strong tags"""
+        # Check if this is likely a list item with colon pattern
+        def is_list_item_with_colon(text):
+            # Pattern: starts with or contains <strong>Text:</strong> or <b>Text:</b>
+            list_pattern = r'^\s*(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
+            return bool(re.search(list_pattern, text))
+        # If it's a list item with colon, preserve the format
+        if is_list_item_with_colon(text):
+            # Just clean up spacing but preserve the colon inside bold
+            text = re.sub(r'<(strong|b)>\s*([^:]+)\s*:\s*</\1>', r'<\1>\2:</\1>', text)
+            return text
+        # Pattern to find bold/strong content
+        bold_pattern = r'<(strong|b)>(.*?)</\1>'
+        def fix_bold_match(match):
+            tag = match.group(1)
+            content = match.group(2).strip()
+            if not content:
+                return f'<{tag}></{tag}>'
+            # Check if this is a list header (contains colon at the end)
+            if content.endswith(':'):
+                # Preserve list headers with colons
+                return f'<{tag}>{content}</{tag}>'
+            # Remove any periods at the start or end of bold content
+            content = content.strip('.')
+            # Check if this bold text is at the start of a sentence
+            # (preceded by nothing, or by '. ', '! ', '? ')
+            start_pos = match.start()
+            is_sentence_start = (start_pos == 0 or
+                               (start_pos > 2 and text[start_pos-2:start_pos] in ['. ', '! ', '? ', '\n\n']))
+            # Capitalize first letter if it's at sentence start
+            if is_sentence_start and content and content[0].isalpha():
+                content = content[0].upper() + content[1:]
+            return f'<{tag}>{content}</{tag}>'
+        # Fix bold/strong tags
+        text = re.sub(bold_pattern, fix_bold_match, text)
+        # Fix spacing around bold/strong tags (but not for list items)
+        if not is_list_item_with_colon(text):
+            text = re.sub(r'\.\s*<(strong|b)>', r'. <\1>', text)  # Period before bold
+            text = re.sub(r'</(strong|b)>\s*\.', r'</\1>.', text)  # Period after bold
+            text = re.sub(r'([.!?])\s*<(strong|b)>', r'\1 <\2>', text)  # Space after sentence end
+            text = re.sub(r'</(strong|b)>\s+([a-z])', lambda m: f'</{m.group(1)}> {m.group(2)}', text)  # Keep lowercase after bold if mid-sentence
+            # Remove duplicate periods around bold tags
+            text = re.sub(r'\.\s*</(strong|b)>\s*\.', r'</\1>.', text)
+            text = re.sub(r'\.\s*<(strong|b)>\s*\.', r'. <\1>', text)
+            # Fix cases where bold content ends a sentence
+            # If bold is followed by a new sentence (capital letter), add period
+            text = re.sub(r'</(strong|b)>\s+([A-Z])', r'</\1>. \2', text)
+        # Don't remove these for list items
+        if not is_list_item_with_colon(text):
+            text = re.sub(r'<(strong|b)>\s*:\s*</\1>', ':', text)  # Remove empty bold colons
+            text = re.sub(r'<(strong|b)>\s*\.\s*</\1>', '.', text)  # Remove empty bold periods
+        return text
+    def extract_text_from_html(self, html_content):
+        """Extract text elements from HTML with skip logic"""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        text_elements = []
+        # Get all text nodes using string instead of text (fixing deprecation)
+        for element in soup.find_all(string=True):
+            # Skip script, style, and noscript content completely
+            if element.parent.name in ['script', 'style', 'noscript']:
+                continue
+            text = element.strip()
+            if text and not self.should_skip_element(element, text):
+                text_elements.append({
+                    'text': text,
+                    'element': element
+                })
+        return soup, text_elements
+    def validate_and_fix_html(self, html_text):
+        """Fix common HTML syntax errors after processing"""
+        # Fix DOCTYPE
+        html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+        # Fix spacing issues
+        html_text = re.sub(r'>\s+<', '><', html_text)  # Remove extra spaces between tags
+        html_text = re.sub(r'\s+>', '>', html_text)  # Remove spaces before closing >
+        html_text = re.sub(r'<\s+', '<', html_text)  # Remove spaces after opening <
+        # Fix common word errors that might occur during processing
+        html_text = html_text.replace('down loaded', 'downloaded')
+        html_text = html_text.replace('But your document', 'Your document')
+        return html_text
+    def wrap_keywords_in_paragraphs(self, soup, keywords):
+        """Wrap keywords with <strong> tags inside <p> tags only"""
+        if not keywords:
+            return
+        # Find all paragraph tags
+        for p_tag in soup.find_all('p'):
+            # Skip paragraphs that are inside special elements
+            # Check if paragraph is inside any of these elements
+            skip_parents = ['div.author-intro', 'div.cta-box', 'div.testimonial-card',
+                          'div.news-box', 'button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                          'div.quiz-container', 'div.question-container', 'div.results']
+            # Check if this paragraph should be skipped
+            should_skip = False
+            for parent in p_tag.parents:
+                # Check by class
+                if parent.name == 'div' and parent.get('class'):
+                    classes = parent.get('class', [])
+                    if isinstance(classes, list):
+                        class_str = ' '.join(str(cls) for cls in classes)
+                    else:
+                        class_str = str(classes)
+                    if any(skip_class in class_str for skip_class in
+                          ['author-intro', 'cta-box', 'testimonial-card', 'news-box',
+                           'quiz-container', 'question-container', 'results', 'stats-grid',
+                           'toc-', 'comparison-tables']):
+                        should_skip = True
+                        break
+                # Check by tag name
+                if parent.name in ['button', 'a', 'blockquote', 'details', 'summary']:
+                    should_skip = True
+                    break
+            if should_skip:
+                continue
+            # Additional check: Skip if paragraph has specific classes
+            p_classes = p_tag.get('class', [])
+            if isinstance(p_classes, list):
+                p_class_str = ' '.join(str(cls) for cls in p_classes)
+            else:
+                p_class_str = str(p_classes)
+            if any(skip_class in p_class_str for skip_class in ['testimonial-card', 'quiz-', 'stat-']):
+                continue
+            # Process only if this is a regular content paragraph
+            # Get all text nodes in this paragraph
+            for text_node in p_tag.find_all(string=True):
+                # Skip if already inside a strong or b tag
+                if text_node.parent.name in ['strong', 'b', 'em', 'i', 'span', 'a']:
+                    continue
+                # Skip if the text node's immediate parent isn't the p tag
+                # (to avoid nested elements)
+                if text_node.parent != p_tag:
+                    continue
+                original_text = str(text_node)
+                # Skip very short text nodes
+                if len(original_text.strip()) < 20:
+                    continue
+                modified_text = original_text
+                # Check each keyword
+                for keyword in keywords:
+                    # Use word boundaries for accurate matching
+                    pattern = r'\b' + re.escape(keyword) + r'\b'
+                    # Find all matches (case-insensitive)
+                    matches = list(re.finditer(pattern, modified_text, flags=re.IGNORECASE))
+                    # Replace from end to beginning to maintain positions
+                    for match in reversed(matches):
+                        start, end = match.span()
+                        matched_text = match.group(0)
+                        # Wrap with strong tag
+                        modified_text = (modified_text[:start] +
+                                       f'<strong>{matched_text}</strong>' +
+                                       modified_text[end:])
+                # If text was modified, replace the text node
+                if modified_text != original_text:
+                    # Parse the modified text to create new nodes
+                    new_soup = BeautifulSoup(modified_text, 'html.parser')
+                    # Replace the text node with the new nodes
+                    for new_node in reversed(new_soup.contents):
+                        text_node.insert_after(new_node)
+                    text_node.extract()
+    def add_natural_flow_variations(self, text):
+        """Add more natural flow and rhythm variations for Originality AI"""
+        sentences = self.split_into_sentences_advanced(text)
+        enhanced_sentences = []
+        for i, sentence in enumerate(sentences):
+            if not sentence.strip():
+                continue
+            # Add stream-of-consciousness elements (10% chance)
+            if random.random() < 0.1 and len(sentence.split()) > 10:
+                stream_elements = [
+                    " - wait, let me back up - ",
+                    " - actually, scratch that - ",
+                    " - or maybe I should say - ",
+                    " - hmm, how do I put this - ",
+                    " - okay, here's the thing - ",
+                    " - you know what I mean? - "
+                ]
+                words = sentence.split()
+                pos = random.randint(len(words)//4, 3*len(words)//4)
+                words.insert(pos, random.choice(stream_elements))
+                sentence = ' '.join(words)
+            # Add human-like self-corrections (5% chance)
+            if random.random() < 0.05:
+                corrections = [
+                    " - or rather, ",
+                    " - well, actually, ",
+                    " - I mean, ",
+                    " - or should I say, ",
+                    " - correction: "
+                ]
+                words = sentence.split()
+                if len(words) > 8:
+                    pos = random.randint(len(words)//2, len(words)-3)
+                    correction = random.choice(corrections)
+                    # Repeat a concept with variation
+                    repeated_word_idx = random.randint(max(0, pos-5), pos-1)
+                    if repeated_word_idx < len(words):
+                        words.insert(pos, correction)
+                sentence = ' '.join(words)
+            # Add thinking-out-loud patterns (8% chance)
+            if random.random() < 0.08 and i > 0:
+                thinking_patterns = [
+                    "Come to think of it, ",
+                    "Actually, you know what? ",
+                    "Wait, here's a thought: ",
+                    "Oh, and another thing - ",
+                    "Speaking of which, ",
+                    "This reminds me, ",
+                    "Now that I mention it, ",
+                    "Funny you should ask, because "
+                ]
+                pattern = random.choice(thinking_patterns)
+                sentence = pattern + sentence[0].lower() + sentence[1:] if len(sentence) > 1 else sentence
+            enhanced_sentences.append(sentence)
+        return ' '.join(enhanced_sentences)
+    def process_html(self, html_content, primary_keywords="", secondary_keywords="", progress_callback=None):
+        """Main processing function with progress callback"""
+        if not html_content.strip():
+            return "Please provide HTML content."
+        # Store all script and style content to preserve it
+        script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
+        style_placeholder = "###STYLE_PLACEHOLDER_{}###"
+        preserved_scripts = []
+        preserved_styles = []
+        # Temporarily replace script and style tags with placeholders
+        soup_temp = BeautifulSoup(html_content, 'html.parser')
+        # Preserve all script tags
+        for idx, script in enumerate(soup_temp.find_all('script')):
+            placeholder = script_placeholder.format(idx)
+            preserved_scripts.append(str(script))
+            script.replace_with(placeholder)
+        # Preserve all style tags
+        for idx, style in enumerate(soup_temp.find_all('style')):
+            placeholder = style_placeholder.format(idx)
+            preserved_styles.append(str(style))
+            style.replace_with(placeholder)
+        # Get the modified HTML
+        html_content = str(soup_temp)
+        # Combine keywords and clean them
+        all_keywords = []
+        if primary_keywords:
+            # Clean and validate each keyword
+            for k in primary_keywords.split(','):
+                cleaned = k.strip()
+                if cleaned and len(cleaned) > 1:  # Skip empty or single-char keywords
+                    all_keywords.append(cleaned)
+        if secondary_keywords:
+            for k in secondary_keywords.split(','):
+                cleaned = k.strip()
+                if cleaned and len(cleaned) > 1:
+                    all_keywords.append(cleaned)
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_keywords = []
+        for k in all_keywords:
+            if k.lower() not in seen:
+                seen.add(k.lower())
+                unique_keywords.append(k)
+        all_keywords = unique_keywords
+        try:
+            # Extract text elements
+            soup, text_elements = self.extract_text_from_html(html_content)
+            total_elements = len(text_elements)
+            print(f"Found {total_elements} text elements to process (after filtering)")
+            if all_keywords:
+                print(f"Preserving keywords: {all_keywords}")
+            # Process each text element
+            processed_count = 0
+            for i, element_info in enumerate(text_elements):
+                original_text = element_info['text']
+                # Skip placeholders
+                if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
+                    continue
+                # Skip very short texts
+                if len(original_text.split()) < 3:
+                    continue
+                # Debug: Check if keywords are in this text
+                text_has_keywords = any(keyword.lower() in original_text.lower() for keyword in all_keywords)
+                if text_has_keywords:
+                    print(f"Debug: Processing text with keywords: {original_text[:50]}...")
+                # First pass with Dipper (with adjusted diversity)
+                paraphrased_text = self.paraphrase_with_dipper(
+                    original_text,
+                    keywords=all_keywords
+                )
+                # Verify no placeholders remain
+                if '__KW' in paraphrased_text or '___' in paraphrased_text:
+                    print(f"Warning: Placeholder or underscores found in paraphrased text: {paraphrased_text[:100]}...")
+                    # Try to restore again with the enhanced function
+                    temp_map = {}
+                    for j, keyword in enumerate(all_keywords):
+                        temp_map[f'__KW{j:03d}__'] = keyword
+                    paraphrased_text = self.restore_keywords_robust(paraphrased_text, temp_map)
+                # Second pass with BART for longer texts (increased probability)
+                if self.use_bart and len(paraphrased_text.split()) > 8:
+                    # 50% chance to use BART for more variation (reduced from 60%)
+                    if random.random() < 0.5:
+                        paraphrased_text = self.paraphrase_with_bart(
+                            paraphrased_text,
+                            keywords=all_keywords
+                        )
+                # Apply sentence variation
+                paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+                # Add natural flow variations
+                paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+                # Fix punctuation and formatting
+                paraphrased_text = self.fix_punctuation(paraphrased_text)
+                # Final check for any remaining placeholders or underscores
+                if '___' in paraphrased_text or '__KW' in paraphrased_text:
+                    print(f"Error: Unresolved placeholders in final text")
+                    # Use original text if we can't resolve placeholders
+                    paraphrased_text = original_text
+                # Final quality check
+                if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                    element_info['element'].replace_with(NavigableString(paraphrased_text))
+                    processed_count += 1
+                # Progress update
+                if progress_callback:
+                    progress_callback(i + 1, total_elements)
+                if i % 10 == 0 or i == total_elements - 1:
+                    progress = (i + 1) / total_elements * 100
+                    print(f"Progress: {progress:.1f}%")
+            # Wrap keywords with <strong> tags in paragraphs
+            self.wrap_keywords_in_paragraphs(soup, all_keywords)
+            # Post-process the entire HTML to fix bold/strong formatting
+            result = str(soup)
+            result = self.post_process_html(result)
+            # Final safety check for any remaining placeholders or underscores
+            if '__KW' in result or re.search(r'_{3,}', result):
+                print("Warning: Found placeholders or multiple underscores in final HTML output")
+                # Attempt to clean them with keywords
+                for i, keyword in enumerate(all_keywords):
+                    result = result.replace(f'__KW{i:03d}__', keyword)
+                    result = re.sub(r'_{3,}', keyword, result, count=1)
+            # Restore all script tags
+            for idx, script_content in enumerate(preserved_scripts):
+                placeholder = script_placeholder.format(idx)
+                result = result.replace(placeholder, script_content)
+            # Restore all style tags
+            for idx, style_content in enumerate(preserved_styles):
+                placeholder = style_placeholder.format(idx)
+                result = result.replace(placeholder, style_content)
+            # Validate and fix HTML syntax
+            result = self.validate_and_fix_html(result)
+            # Count skipped elements properly
+            all_text_elements = soup.find_all(string=True)
+            skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
+            print(f"Successfully processed {processed_count} text elements")
+            print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
+            print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")
+            return result
+        except Exception as e:
+            import traceback
+            error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+            print(error_msg)
+            # Return original HTML with error message prepended as HTML comment
+            return f"<!-- {error_msg} -->\n{html_content}"
+    def post_process_html(self, html_text):
+        """Post-process the entire HTML to fix formatting issues"""
+        # Fix empty angle brackets that might appear
+        html_text = re.sub(r'<>\s*([^<>]+?)\s*(?=\.|\s|<)', r'\1', html_text)  # Remove <> around text
+        html_text = re.sub(r'<>', '', html_text)  # Remove any remaining empty <>
+        # Fix double angle brackets around bold tags
+        html_text = re.sub(r'<<b>>', '<b>', html_text)
+        html_text = re.sub(r'<</b>>', '</b>', html_text)
+        html_text = re.sub(r'<<strong>>', '<strong>', html_text)
+        html_text = re.sub(r'<</strong>>', '</strong>', html_text)
+        # Fix periods around bold/strong tags
+        html_text = re.sub(r'\.\s*<(b|strong)>', '. <\1>', html_text)  # Period before bold
+        html_text = re.sub(r'</(b|strong)>\s*\.', '</\1>.', html_text)  # Period after bold
+        html_text = re.sub(r'\.<<(b|strong)>>', '. <\1>', html_text)  # Fix double bracket cases
+        html_text = re.sub(r'</(b|strong)>>\.', '</\1>.', html_text)
+        # Fix periods after colons
+        html_text = re.sub(r':\s*\.', ':', html_text)
+        html_text = re.sub(r'\.:', ':', html_text)
+        # Check if a line is a list item
+        def process_line(line):
+            # Check if this line contains a list pattern with bold
+            list_pattern = r'(?:^|\s)(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
+            if re.search(list_pattern, line):
+                # This is a list item, preserve the colon format
+                return line
+            # Not a list item, apply regular fixes
+            # Remove periods immediately inside bold tags
+            line = re.sub(r'<(strong|b)>\s*\.\s*([^<]+)\s*\.\s*</\1>', r'<\1>\2</\1>', line)
+            # Fix sentence endings with bold
+            line = re.sub(r'</(strong|b)>\s*([.!?])', r'</\1>\2', line)
+            return line
+        # Process line by line to preserve list formatting
+        lines = html_text.split('\n')
+        processed_lines = [process_line(line) for line in lines]
+        html_text = '\n'.join(processed_lines)
+        # Fix sentence starts with bold
+        def fix_bold_sentence_start(match):
+            pre_context = match.group(1)
+            tag = match.group(2)
+            content = match.group(3)
+            # Skip if this is part of a list item with colon
+            full_match = match.group(0)
+            if ':' in full_match and '</' + tag + '>' in full_match:
+                return full_match
+            # Check if this should start with capital
+            if pre_context == '' or pre_context.endswith(('.', '!', '?', '>')):
+                if content and content[0].islower():
+                    content = content[0].upper() + content[1:]
+            return f'{pre_context}<{tag}>{content}'
+        # Look for bold/strong tags and check their context
+        html_text = re.sub(r'(^|.*?)(<(?:strong|b)>)([a-zA-Z])', fix_bold_sentence_start, html_text)
+        # Clean up spacing around bold tags (but preserve list formatting)
+        # Split into segments to handle list items separately
+        segments = re.split(r'(<(?:strong|b)>[^<]*:</(?:strong|b)>)', html_text)
+        cleaned_segments = []
+        for i, segment in enumerate(segments):
+            if i % 2 == 1:  # This is a list item pattern
+                cleaned_segments.append(segment)
+            else:
+                # Apply spacing fixes to non-list segments
+                segment = re.sub(r'\s+<(strong|b)>', r' <\1>', segment)
+                segment = re.sub(r'</(strong|b)>\s+', r'</\1> ', segment)
+                # Fix punctuation issues
+                segment = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', segment)
+                # Fix periods inside/around bold
+                segment = re.sub(r'\.<(strong|b)>\.', '. <\1>', segment)
+                segment = re.sub(r'\.</(strong|b)>\.', '</\1>.', segment)
+                cleaned_segments.append(segment)
+        html_text = ''.join(cleaned_segments)
+        # Final cleanup
+        html_text = re.sub(r'\.{2,}', '.', html_text)  # Multiple periods
+        html_text = re.sub(r',{2,}', ',', html_text)  # Multiple commas
+        html_text = re.sub(r':{2,}', ':', html_text)  # Multiple colons
+        html_text = re.sub(r'\s+([.,!?;:])', r'\1', html_text)  # Space before punctuation
+        # Fix empty bold tags (but not those with just colons)
+        html_text = re.sub(r'<(strong|b)>\s*</\1>', '', html_text)
+        # Fix specific patterns in lists/stats
+        # Pattern like "5,000+" should not have period after
+        html_text = re.sub(r'(\d+[,\d]*\+?)\s*\.\s*\n', r'\1\n', html_text)
+        # Clean up any remaining double brackets
+        html_text = re.sub(r'<<', '<', html_text)
+        html_text = re.sub(r'>>', '>', html_text)
+        # Apply final minimal grammar fixes
+        html_text = self.grammar_fixer.smart_fix(html_text)
+        return html_text
+# Module-level humanizer instance, created once at import time and used by
+# humanize_html() below for every request.
+humanizer = EnhancedDipperHumanizer()
+def humanize_html(html_input, primary_keywords="", secondary_keywords="", progress=gr.Progress()):
+    """Gradio interface function with progress updates"""
+    if not html_input:
+        return "Please provide HTML content to humanize."
+    progress(0, desc="Starting processing...")
+    start_time = time.time()
+    # Create a wrapper to update progress
+    def progress_callback(current, total):
+        if total > 0:
+            progress(current / total, desc=f"Processing: {current}/{total} elements")
+    # Pass progress callback to process_html
+    result = humanizer.process_html(
+        html_input,
+        primary_keywords,
+        secondary_keywords,
+        progress_callback=progress_callback
+    )
+    processing_time = time.time() - start_time
+    print(f"Processing completed in {processing_time:.2f} seconds")
+    progress(1.0, desc="Complete!")
+    return result
+# Build the Gradio UI: three text inputs (HTML plus two keyword lists) wired to
+# humanize_html, and one text output. The request queue itself is enabled in the
+# __main__ block, not here.
+iface = gr.Interface(
+    fn=humanize_html,
+    # Inputs are passed positionally to humanize_html in this order
+    inputs=[
+        gr.Textbox(
+            lines=10,
+            placeholder="Paste your HTML content here...",
+            label="HTML Input"
+        ),
+        gr.Textbox(
+            placeholder="Enter primary keywords separated by commas (e.g., GMAT Focus Edition, MBA, Data Insights)",
+            label="Primary Keywords (preserved exactly)"
+        ),
+        gr.Textbox(
+            placeholder="Enter secondary keywords separated by commas (e.g., test preparation, business school)",
+            label="Secondary Keywords (preserved exactly)"
+        )
+    ],
+    outputs=gr.Textbox(
+        lines=10,
+        label="Humanized HTML Output"
+    ),
+    title="Enhanced Dipper AI Humanizer - Optimized for Originality AI",
+    description="""
+    Ultra-aggressive humanizer optimized to achieve 100% human scores on both Undetectable AI and Originality AI.
+    Key Features:
+    - Maximum diversity settings (90% lexical, 40% order) for natural variation
+    - Enhanced human patterns: personal opinions, self-corrections, thinking-out-loud
+    - Natural typos, contractions, and conversational flow
+    - Stream-of-consciousness elements and rhetorical questions
+    - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
+    - Fixed placeholder system that preserves keywords
+    - Keywords inside <p> tags are automatically wrapped with <strong> tags
+    - Skips content in <strong>, <b>, and heading tags (including inside tables)
+    - Designed to pass the strictest AI detection systems
+    The tool creates genuinely human-like writing patterns that fool even the most sophisticated detectors!
+    ⚠️ Note: Processing may take 5-10 minutes for large HTML documents.
+    """,
+    # One example row: [html, primary keywords, secondary keywords]
+    examples=[
+        ["""<article>
+<h1>The Benefits of Regular Exercise</h1>
+<div class="author-intro">By John Doe, Fitness Expert | 10 years experience</div>
+<p>Regular exercise is essential for maintaining good health. It helps improve cardiovascular fitness, strengthens muscles, and enhances mental well-being. Studies have shown that people who exercise regularly have lower risks of chronic diseases.</p>
+<p>Additionally, exercise can boost mood and energy levels. It releases endorphins, which are natural mood elevators. Even moderate activities like walking can make a significant difference in overall health.</p>
+</article>""", "cardiovascular fitness, mental well-being, chronic diseases", "exercise, health, endorphins"]
+    ],
+    theme="default"
+)
+# Script entry point: only runs when the file is executed directly.
+if __name__ == "__main__":
+    # Enable queue for better handling of long-running processes
+    # (max_size=10 is passed to Gradio's queue; presumably the cap on waiting
+    # requests - confirm against the installed Gradio version)
+    iface.queue(max_size=10)
+    # share=True asks Gradio for a public share link in addition to the local server
+    iface.launch(share=True)