import gradio as gr
from transformers import pipeline, AutoTokenizer

# --- MODEL LOADING ---
# Load both the pipeline and the tokenizer for the model.
# The tokenizer is needed to split the text into chunks the model can understand.
model_name = "openai-community/roberta-base-openai-detector"
pipe = pipeline("text-classification", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
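# Note: for RoBERTa-based checkpoints like this one, tokenizer.model_max_length
# is typically 512; the two positions reserved inside the function below cover
# the special tokens (<s> and </s>) the tokenizer wraps around every sequence.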

def detect_ai_text(text):
    """
    Analyzes input text, handling long texts by chunking them into smaller pieces.
    """
    # Get the model's max length, reserving two positions for special tokens.
    max_length = tokenizer.model_max_length - 2

    # Tokenize the entire input text. Special tokens are left out so that
    # chunk boundaries fall on real content tokens.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # If the text is short enough, process it in one go.
    # top_k=None makes the pipeline return a score for every label,
    # not just the top one.
    if len(tokens) <= max_length:
        results = pipe(text, top_k=None)
        return {item["label"]: item["score"] for item in results}
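    # For illustration, the short-path return has the shape
    #   {"<label_0_name>": 0.98, "<label_1_name>": 0.02}
    # where the label names come from the model's config and the values
    # shown here are made up.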

    # --- CHUNKING LOGIC FOR LONG TEXT ---
    # If the text is too long, process it in overlapping chunks and
    # collect the per-label scores from each chunk.
    label_scores = {}

    # Create chunks with a 50-token overlap to maintain context between them.
    for i in range(0, len(tokens), max_length - 50):
        chunk_tokens = tokens[i:i + max_length]
        # Decode the chunk tokens back to a string for the pipeline.
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        # Run the model on the chunk. truncation=True guards against the
        # decode/re-encode round trip producing a slightly longer sequence,
        # and top_k=None returns a score for every label. Scores are grouped
        # by label name rather than a hardcoded label ID, since the label
        # names depend on the model's config.
        for item in pipe(chunk_text, top_k=None, truncation=True):
            label_scores.setdefault(item["label"], []).append(item["score"])
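    # Worked example of the stride arithmetic, assuming a 512-token model
    # limit (so max_length = 510): chunk starts fall at 0, 460, 920, ...,
    # meaning consecutive chunks share 50 tokens at their boundary.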

    # If for some reason no scores were collected, return an error state.
    if not label_scores:
        return {"error": "Could not process text."}

    # Average each label's scores across all chunks to get the final result,
    # returned in the same format as a single run.
    num_chunks = max(len(scores) for scores in label_scores.values())
    result = {label: sum(scores) / len(scores) for label, scores in label_scores.items()}
    result["note"] = f"Result aggregated from {num_chunks} chunks."
    return result

# --- GRADIO INTERFACE ---
iface = gr.Interface(
    fn=detect_ai_text,
    inputs=gr.Textbox(lines=15, placeholder="Paste the text you want to analyze here..."),
    outputs="json",
    title="AI Content Detector (Robust Version)",
    description=(
        "This version handles long texts by breaking them into chunks. It analyzes "
        "text for AI generation using the roberta-base-openai-detector model."
    ),
)
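
# Optional local smoke test (illustrative sketch; the sample text below is
# made up). Uncomment to exercise the chunking path before launching the UI.
# long_sample = "The quick brown fox jumps over the lazy dog. " * 200
# print(detect_ai_text(long_sample))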
# Launch the app
iface.launch()