iyadalagha committed on
Commit
7e73b77
·
1 Parent(s): 861190f

handle both ar and eng

Browse files
Files changed (4) hide show
  1. Dockerfile +5 -23
  2. README.md +28 -7
  3. app.py +77 -177
  4. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,29 +1,11 @@
1
- # Use Python 3.9 as the base image
2
- FROM python:3.9
3
 
4
- # Set working directory in the container
5
  WORKDIR /app
6
 
7
- # Create a non-root user and set permissions
8
- RUN useradd -m myuser && chown -R myuser:myuser /app
9
- USER myuser
10
-
11
- # Set Hugging Face cache directory
12
- ENV HF_HOME=/app/.cache/huggingface
13
-
14
- # Update PATH for uvicorn
15
- ENV PATH="/home/myuser/.local/bin:${PATH}"
16
-
17
- # Copy requirements.txt and install dependencies
18
- COPY --chown=myuser:myuser requirements.txt .
19
  RUN pip install --no-cache-dir -r requirements.txt
20
 
21
- # Clear cache and pre-download models
22
-
23
- RUN rm -rf /app/.cache/huggingface/* && python -c "from transformers import pipeline; pipeline('text-classification', model='desklib/ai-text-detector-v1.01'); pipeline('text-classification', model='akshayvkt/detect-ai-text'); pipeline('text-classification', model='sabaridsnfuji/arabic-ai-text-detector')"
24
-
25
- # Copy the application code
26
- COPY --chown=myuser:myuser . .
27
 
28
- # Run the FastAPI app with Uvicorn
29
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM python:3.11-slim
 
2
 
 
3
  WORKDIR /app
4
 
5
+ COPY requirements.txt .
 
 
 
 
 
 
 
 
 
 
 
6
  RUN pip install --no-cache-dir -r requirements.txt
7
 
8
+ COPY . .
 
 
 
 
 
9
 
10
+ EXPOSE 7860
11
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,9 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: AI Text Detector
3
- emoji: 🤖
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: docker
7
- app_port: 7860
 
 
 
8
  ---
9
- A FastAPI app using akshayvkt/detect-ai-text to classify text as AI-generated or human-written for IUG.
 
 
 
 
 
 
1
+ # 🤖 AI Text Detector (Improved)
2
+
3
+ This project provides a **FastAPI app** for detecting whether a given text is **AI-generated** or **human-written**.
4
+ Unlike the original version, this detector uses **two signals**:
5
+
6
+ 1. **Classifier score** from [`Hello-SimpleAI/chatgpt-detector-roberta`](https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta)
7
+ → Probability that the text is AI-generated.
8
+
9
+ 2. **Perplexity score** using **GPT-2**
10
+ → Measures how “predictable” the text is. Lower perplexity often indicates AI-like fluency.
11
+
12
+ The app then combines both scores into a **final label**: `AI`, `Human`, or `Uncertain`.
13
+
14
  ---
15
+
16
+ ## 🔧 Tech Stack
17
+ - Python 3.11
18
+ - FastAPI
19
+ - Hugging Face Transformers
20
+ - PyTorch
21
+ - Uvicorn
22
+ - SciPy (installed via requirements.txt; the perplexity calculation itself uses PyTorch)
23
+
24
  ---
25
+
26
+ ## 🚀 Running the App
27
+
28
+ ### 1. Build the Docker image
29
+ ```bash
30
+ docker build -t ai-detector .
app.py CHANGED
@@ -1,181 +1,81 @@
1
- from fastapi import FastAPI, HTTPException
2
- from pydantic import BaseModel, validator
3
- import re
4
  import torch
5
- from transformers import pipeline
6
- from collections import Counter
7
- import logging
8
- import numpy as np
9
-
10
- # Configure logging with more detail
11
- logging.basicConfig(filename="predictions.log", level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
12
-
13
- app = FastAPI()
14
-
15
- # Enable GPU if available, else use CPU
16
- device = 0 if torch.cuda.is_available() else -1
17
- torch.manual_seed(42)
18
-
19
- # Load AI detection models
20
- english_detectors = [
21
- pipeline("text-classification", model="desklib/ai-text-detector-v1.01", truncation=True, max_length=512)
22
- pipeline("text-classification", model="akshayvkt/detect-ai-text", device=device, truncation=True, max_length=512),
23
- ]
24
- arabic_detector = pipeline("text-classification", model="sabaridsnfuji/arabic-ai-text-detector", device=device, truncation=True, max_length=512)
25
-
26
- def detect_language(text: str) -> str:
27
- """Detect if text is Arabic or English based on Unicode character ranges."""
28
- arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
29
- latin_chars = len(re.findall(r'[A-Za-z]', text))
30
- total_chars = arabic_chars + latin_chars
31
- if total_chars == 0:
32
- return 'en'
33
- arabic_ratio = arabic_chars / total_chars
34
- return 'ar' if arabic_ratio > 0.5 else 'en'
35
-
36
- def calculate_burstiness(text: str) -> float:
37
- """Calculate burstiness (std/mean of sentence lengths) to bias toward human text."""
38
- sentences = re.split(r'[.!?]', text)
39
- lengths = [len(s.split()) for s in sentences if s]
40
- return np.std(lengths) / (np.mean(lengths) + 1e-6) if lengths else 0
41
-
42
- def clean_text(text: str, language: str) -> str:
43
- """Clean text by removing special characters and normalizing spaces. Skip lowercase for Arabic."""
44
- text = re.sub(r'\s+', ' ', text)
45
- text = re.sub(r'[^\w\s.,!?]', '', text)
46
- text = text.strip()
47
- if language == 'en':
48
- text = text.lower()
49
- return text
50
-
51
- def split_text(text: str, max_chars: int = 5000) -> list:
52
- """Split text into chunks of max_chars, preserving sentence boundaries."""
53
- sentences = re.split(r'(?<=[.!?])\s+', text)
54
- chunks = []
55
- current_chunk = ""
56
- for sentence in sentences:
57
- if len(current_chunk) + len(sentence) <= max_chars:
58
- current_chunk += sentence + " "
59
- else:
60
- if current_chunk:
61
- chunks.append(current_chunk.strip())
62
- current_chunk = sentence + " "
63
- if current_chunk:
64
- chunks.append(current_chunk.strip())
65
- return chunks
66
-
67
- class TextInput(BaseModel):
68
  text: str
69
 
70
- @validator("text")
71
- def validate_text(cls, value):
72
- """Validate input text for minimum length and content."""
73
- word_count = len(value.split())
74
- if word_count < 50:
75
- raise ValueError(f"Text too short ({word_count} words). Minimum 50 words required.")
76
- if not re.search(r'[\w]', value):
77
- raise ValueError("Text must contain alphabetic characters.")
78
- return value
79
-
80
- @app.post("/predict")
81
- def predict(input: TextInput):
82
- detected_lang = detect_language(input.text)
83
- note_lang = f"Detected language: {'Arabic' if detected_lang == 'ar' else 'English'}"
84
- cleaned_text = clean_text(input.text, detected_lang)
85
- burstiness = calculate_burstiness(cleaned_text)
86
- note_burst = f"Burstiness: {burstiness:.2f} (high suggests human)"
87
-
88
- if detected_lang == 'ar':
89
- detector = arabic_detector
90
- is_ensemble = False
91
- else:
92
- detector = english_detectors
93
- is_ensemble = True
94
-
95
- if len(cleaned_text) > 10000:
96
- chunks = split_text(cleaned_text, max_chars=5000)
97
- if is_ensemble:
98
- all_results = [det(chunks, truncation=True, max_length=512) for det in detector]
99
- labels = []
100
- scores = []
101
- for chunk_idx in range(len(chunks)):
102
- chunk_labels = []
103
- chunk_scores = []
104
- for det_idx, det_results in enumerate(all_results):
105
- score = det_results[chunk_idx]['score']
106
- label = "AI" if score >= 0.98 else "Human" if score < 0.55 else "Uncertain"
107
- chunk_labels.append(label)
108
- chunk_scores.append(score)
109
- logging.debug(f"Chunk {chunk_idx}, Model {det_idx}: Label={label}, Score={score:.4f}")
110
- chunk_final_label = Counter(chunk_labels).most_common(1)[0][0]
111
- if chunk_final_label == "Uncertain" or len(set(chunk_labels)) == len(detector) or any(l == "Human" for l in chunk_labels): # Prioritize Human if any model predicts it
112
- chunk_final_label = "Human" if burstiness > 1.5 else "Uncertain"
113
- labels.append(chunk_final_label)
114
- scores.append(np.mean(chunk_scores))
115
- logging.debug(f"Chunk {chunk_idx} Final: Label={chunk_final_label}, Avg Score={np.mean(chunk_scores):.4f}, Burstiness={burstiness:.2f}")
116
- label_counts = Counter(labels)
117
- final_label = label_counts.most_common(1)[0][0]
118
- if final_label == "Uncertain" or len(set(labels)) == len(detector) or any(l == "Human" for l in labels):
119
- final_label = "Human" if burstiness > 1.5 else "Uncertain"
120
- avg_score = sum(scores) / len(scores) if scores else 0.0
121
- logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Chunks: {len(chunks)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f}")
122
- return {
123
- "prediction": final_label,
124
- "score": avg_score,
125
- "note": f"{note_lang}. Text was split into {len(chunks)} chunks due to length > 10,000 characters. {note_burst}.",
126
- "chunk_results": [
127
- {"chunk": chunk[:50] + "...", "label": labels[i], "score": scores[i], "burstiness": burstiness}
128
- for i, chunk in enumerate(chunks)
129
- ]
130
- }
131
- else:
132
- results = detector(chunks, truncation=True, max_length=512)
133
- labels = ["AI" if res['score'] >= 0.95 else "Human" if res['score'] < 0.60 else "Uncertain" for res in results]
134
- if any(l == "Uncertain" for l in labels) or any(l == "Human" for l in labels):
135
- labels = ["Human" if l == "Uncertain" or l == "Human" else l for l in labels if burstiness > 1.0]
136
- label_counts = Counter(labels)
137
- final_label = label_counts.most_common(1)[0][0]
138
- scores = [res['score'] for res, label in zip(results, labels) if label == final_label]
139
- avg_score = sum(scores) / len(scores) if scores else 0.0
140
- logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Chunks: {len(chunks)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f}")
141
- return {
142
- "prediction": final_label,
143
- "score": avg_score,
144
- "note": f"{note_lang}. Text was split into {len(chunks)} chunks due to length > 10,000 characters. {note_burst}.",
145
- "chunk_results": [
146
- {"chunk": chunk[:50] + "...", "label": labels[i], "score": results[i]['score'], "burstiness": burstiness}
147
- for i, chunk in enumerate(chunks)
148
- ]
149
- }
150
  else:
151
- if is_ensemble:
152
- results = [det(cleaned_text, truncation=True, max_length=512)[0] for det in detector]
153
- labels = []
154
- scores = []
155
- for det_idx, result in enumerate(results):
156
- score = result['score']
157
- label = "AI" if score >= 0.98 else "Human" if score < 0.55 else "Uncertain"
158
- labels.append(label)
159
- scores.append(score)
160
- logging.debug(f"Model {det_idx}: Label={label}, Score={score:.4f}")
161
- label_counts = Counter(labels)
162
- final_label = label_counts.most_common(1)[0][0]
163
- if final_label == "Uncertain" or len(set(labels)) == len(detector) or any(l == "Human" for l in labels):
164
- final_label = "Human" if burstiness > 1.5 else "Uncertain"
165
- avg_score = sum(scores) / len(scores) if scores else 0.0
166
- note = f"{note_lang}. Ensemble used: {len(detector)} models. {note_burst}."
167
- if 0.55 <= avg_score < 0.98:
168
- note += " Warning: Close to threshold, result may be uncertain."
169
- logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f} | Model Scores: {scores}")
170
- else:
171
- result = detector(cleaned_text, truncation=True, max_length=512)[0]
172
- score = result['score']
173
- final_label = "AI" if score >= 0.95 else "Human" if score < 0.60 else "Uncertain"
174
- if final_label == "Uncertain" or final_label == "Human":
175
- final_label = "Human" if burstiness > 1.0 else "Uncertain"
176
- avg_score = score
177
- note = f"{note_lang}. {note_burst}."
178
- if 0.60 <= score < 0.95:
179
- note += " Warning: Close to threshold, result may be uncertain."
180
- logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f}")
181
- return {"prediction": final_label, "score": avg_score, "note": note}
 
1
+ from fastapi import FastAPI
2
+ from pydantic import BaseModel
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
4
  import torch
5
+ import math
6
+
7
+ app = FastAPI(title="Improved AI Text Detector")
8
+
9
+ # 1. Classifier model (better than akshayvkt)
10
+ clf_model_name = "Hello-SimpleAI/chatgpt-detector-roberta"
11
+ clf_tokenizer = AutoTokenizer.from_pretrained(clf_model_name)
12
+ clf_model = AutoModelForSequenceClassification.from_pretrained(clf_model_name)
13
+
14
+ # 2. Perplexity model (GPT-2)
15
+ ppl_model_name = "gpt2"
16
+ ppl_tokenizer = AutoTokenizer.from_pretrained(ppl_model_name)
17
+ ppl_model = AutoModelForCausalLM.from_pretrained(ppl_model_name)
18
+
19
+ class InputText(BaseModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  text: str
21
 
22
+ def get_classifier_score(text: str) -> float:
23
+ inputs = clf_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
24
+ with torch.no_grad():
25
+ outputs = clf_model(**inputs)
26
+ probs = torch.softmax(outputs.logits, dim=-1)
27
+ ai_prob = probs[0][1].item() # label 1 = AI
28
+ return ai_prob
29
+
30
+ def get_perplexity(text: str) -> float:
31
+ encodings = ppl_tokenizer(text, return_tensors="pt")
32
+ max_length = ppl_model.config.n_positions
33
+ stride = 512
34
+ seq_len = encodings.input_ids.size(1)
35
+
36
+ nlls = []
37
+ prev_end_loc = 0
38
+ for begin_loc in range(0, seq_len, stride):
39
+ end_loc = min(begin_loc + stride, seq_len)
40
+ trg_len = end_loc - prev_end_loc
41
+ input_ids = encodings.input_ids[:, begin_loc:end_loc]
42
+ target_ids = input_ids.clone()
43
+ target_ids[:, :-trg_len] = -100
44
+
45
+ with torch.no_grad():
46
+ outputs = ppl_model(input_ids, labels=target_ids)
47
+ neg_log_likelihood = outputs.loss * trg_len
48
+
49
+ nlls.append(neg_log_likelihood)
50
+ prev_end_loc = end_loc
51
+
52
+ if end_loc == seq_len:
53
+ break
54
+
55
+ ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
56
+ return ppl.item()
57
+
58
+ @app.post("/detect")
59
+ def detect(input_text: InputText):
60
+ text = input_text.text.strip()
61
+
62
+ # Run classifier
63
+ clf_score = get_classifier_score(text)
64
+
65
+ # Run perplexity
66
+ ppl = get_perplexity(text)
67
+
68
+ # Decision rule: combine both
69
+ # Lower perplexity (<50) + high classifier_score (>0.7) = AI
70
+ if clf_score > 0.7 and ppl < 50:
71
+ final = "AI"
72
+ elif clf_score < 0.3 and ppl > 80:
73
+ final = "Human"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  else:
75
+ final = "Uncertain"
76
+
77
+ return {
78
+ "classifier_score": round(clf_score, 4),
79
+ "perplexity": round(ppl, 2),
80
+ "final_label": final
81
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch==2.4.1
3
  fastapi==0.115.2
4
  uvicorn==0.32.0
5
  pydantic==2.9.2
6
- numpy==2.0.2
 
 
3
  fastapi==0.115.2
4
  uvicorn==0.32.0
5
  pydantic==2.9.2
6
+ numpy==2.0.2
7
+ scipy==1.14.1