Commit 7e73b77 · Parent: 861190f

handle both ar and eng

Files changed:
- Dockerfile: +5 -23
- README.md: +28 -7
- app.py: +77 -177
- requirements.txt: +2 -1
Dockerfile CHANGED
@@ -1,29 +1,11 @@
-
-FROM python:3.9
+FROM python:3.11-slim
 
-# Set working directory in the container
 WORKDIR /app
 
-
-RUN useradd -m myuser && chown -R myuser:myuser /app
-USER myuser
-
-# Set Hugging Face cache directory
-ENV HF_HOME=/app/.cache/huggingface
-
-# Update PATH for uvicorn
-ENV PATH="/home/myuser/.local/bin:${PATH}"
-
-# Copy requirements.txt and install dependencies
-COPY --chown=myuser:myuser requirements.txt .
+COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-
-
-RUN rm -rf /app/.cache/huggingface/* && python -c "from transformers import pipeline; pipeline('text-classification', model='desklib/ai-text-detector-v1.01'); pipeline('text-classification', model='akshayvkt/detect-ai-text'); pipeline('text-classification', model='sabaridsnfuji/arabic-ai-text-detector')"
-
-# Copy the application code
-COPY --chown=myuser:myuser . .
+COPY . .
 
-
-CMD ["uvicorn", "
+EXPOSE 7860
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
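For a quick smoke test of the slimmed image, a minimal build-and-run sketch; the `ai-detector` tag is taken from the README instructions below, and 7860 is the port the Dockerfile exposes. One caveat worth noting: the new CMD targets `main:app`, while the module this commit changes is `app.py`; unless the repo also ships a `main.py`, the uvicorn target would presumably need to be `app:app`.

```bash
# Build the image from the repository root
docker build -t ai-detector .

# Run the container, publishing the port EXPOSEd in the Dockerfile
docker run -p 7860:7860 ai-detector
```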
README.md CHANGED
@@ -1,9 +1,30 @@
+# 🤖 AI Text Detector (Improved)
+
+This project provides a **FastAPI app** for detecting whether a given text is **AI-generated** or **human-written**.
+Unlike the original version, this detector uses **two signals**:
+
+1. **Classifier score** from [`Hello-SimpleAI/chatgpt-detector-roberta`](https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta)
+   → Probability that the text is AI-generated.
+
+2. **Perplexity score** using **GPT-2**
+   → Measures how “predictable” the text is. Lower perplexity often indicates AI-like fluency.
+
+The app then combines both scores into a **final label**: `AI`, `Human`, or `Uncertain`.
+
 ---
- … (6 removed lines not shown in the extracted diff)
+
+## 🔧 Tech Stack
+- Python 3.11
+- FastAPI
+- Hugging Face Transformers
+- PyTorch
+- Uvicorn
+- SciPy (for perplexity math)
+
 ---
- … (1 removed line not shown in the extracted diff)
+
+## 🚀 Running the App
+
+### 1. Build the Docker image
+```bash
+docker build -t ai-detector .
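For reference, the perplexity signal the README describes is the exponentiated average negative log-likelihood of the text under GPT-2; `get_perplexity` in the app.py diff below computes a sliding-window version of this definition:

$$\mathrm{PPL}(x) = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N} \log p_\theta\!\left(x_i \mid x_{<i}\right)\right)$$

Lower values mean GPT-2 finds each next token unsurprising, which is the “AI-like fluency” the README mentions. The code realizes this by summing `loss * trg_len` over fixed-stride windows and dividing by the total token count before exponentiating.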
app.py CHANGED
@@ -1,181 +1,81 @@
-from fastapi import FastAPI
-from pydantic import BaseModel
-import
+from fastapi import FastAPI
+from pydantic import BaseModel
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
 import torch
- … (old lines 5–19: 15 removed lines not shown in the extracted diff)
-english_detectors = [
-    pipeline("text-classification", model="desklib/ai-text-detector-v1.01", truncation=True, max_length=512)
-    pipeline("text-classification", model="akshayvkt/detect-ai-text", device=device, truncation=True, max_length=512),
-]
-arabic_detector = pipeline("text-classification", model="sabaridsnfuji/arabic-ai-text-detector", device=device, truncation=True, max_length=512)
-
-def detect_language(text: str) -> str:
-    """Detect if text is Arabic or English based on Unicode character ranges."""
-    arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
-    latin_chars = len(re.findall(r'[A-Za-z]', text))
-    total_chars = arabic_chars + latin_chars
-    if total_chars == 0:
-        return 'en'
-    arabic_ratio = arabic_chars / total_chars
-    return 'ar' if arabic_ratio > 0.5 else 'en'
-
-def calculate_burstiness(text: str) -> float:
-    """Calculate burstiness (std/mean of sentence lengths) to bias toward human text."""
-    sentences = re.split(r'[.!?]', text)
-    lengths = [len(s.split()) for s in sentences if s]
-    return np.std(lengths) / (np.mean(lengths) + 1e-6) if lengths else 0
-
-def clean_text(text: str, language: str) -> str:
-    """Clean text by removing special characters and normalizing spaces. Skip lowercase for Arabic."""
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s.,!?]', '', text)
-    text = text.strip()
-    if language == 'en':
-        text = text.lower()
-    return text
-
-def split_text(text: str, max_chars: int = 5000) -> list:
-    """Split text into chunks of max_chars, preserving sentence boundaries."""
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) <= max_chars:
-            current_chunk += sentence + " "
-        else:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence + " "
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
-
-class TextInput(BaseModel):
+import math
+
+app = FastAPI(title="Improved AI Text Detector")
+
+# 1. Classifier model (better than akshayvkt)
+clf_model_name = "Hello-SimpleAI/chatgpt-detector-roberta"
+clf_tokenizer = AutoTokenizer.from_pretrained(clf_model_name)
+clf_model = AutoModelForSequenceClassification.from_pretrained(clf_model_name)
+
+# 2. Perplexity model (GPT-2)
+ppl_model_name = "gpt2"
+ppl_tokenizer = AutoTokenizer.from_pretrained(ppl_model_name)
+ppl_model = AutoModelForCausalLM.from_pretrained(ppl_model_name)
+
+class InputText(BaseModel):
     text: str
 
- … (old lines 70–121: 52 removed lines not shown in the extracted diff)
-        return {
-            "prediction": final_label,
-            "score": avg_score,
-            "note": f"{note_lang}. Text was split into {len(chunks)} chunks due to length > 10,000 characters. {note_burst}.",
-            "chunk_results": [
-                {"chunk": chunk[:50] + "...", "label": labels[i], "score": scores[i], "burstiness": burstiness}
-                for i, chunk in enumerate(chunks)
-            ]
-        }
-    else:
-        results = detector(chunks, truncation=True, max_length=512)
-        labels = ["AI" if res['score'] >= 0.95 else "Human" if res['score'] < 0.60 else "Uncertain" for res in results]
-        if any(l == "Uncertain" for l in labels) or any(l == "Human" for l in labels):
-            labels = ["Human" if l == "Uncertain" or l == "Human" else l for l in labels if burstiness > 1.0]
-        label_counts = Counter(labels)
-        final_label = label_counts.most_common(1)[0][0]
-        scores = [res['score'] for res, label in zip(results, labels) if label == final_label]
-        avg_score = sum(scores) / len(scores) if scores else 0.0
-        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Chunks: {len(chunks)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f}")
-        return {
-            "prediction": final_label,
-            "score": avg_score,
-            "note": f"{note_lang}. Text was split into {len(chunks)} chunks due to length > 10,000 characters. {note_burst}.",
-            "chunk_results": [
-                {"chunk": chunk[:50] + "...", "label": labels[i], "score": results[i]['score'], "burstiness": burstiness}
-                for i, chunk in enumerate(chunks)
-            ]
-        }
+def get_classifier_score(text: str) -> float:
+    inputs = clf_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = clf_model(**inputs)
+    probs = torch.softmax(outputs.logits, dim=-1)
+    ai_prob = probs[0][1].item()  # label 1 = AI
+    return ai_prob
+
+def get_perplexity(text: str) -> float:
+    encodings = ppl_tokenizer(text, return_tensors="pt")
+    max_length = ppl_model.config.n_positions
+    stride = 512
+    seq_len = encodings.input_ids.size(1)
+
+    nlls = []
+    prev_end_loc = 0
+    for begin_loc in range(0, seq_len, stride):
+        end_loc = min(begin_loc + stride, seq_len)
+        trg_len = end_loc - prev_end_loc
+        input_ids = encodings.input_ids[:, begin_loc:end_loc]
+        target_ids = input_ids.clone()
+        target_ids[:, :-trg_len] = -100
+
+        with torch.no_grad():
+            outputs = ppl_model(input_ids, labels=target_ids)
+            neg_log_likelihood = outputs.loss * trg_len
+
+        nlls.append(neg_log_likelihood)
+        prev_end_loc = end_loc
+
+        if end_loc == seq_len:
+            break
+
+    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
+    return ppl.item()
+
+@app.post("/detect")
+def detect(input_text: InputText):
+    text = input_text.text.strip()
+
+    # Run classifier
+    clf_score = get_classifier_score(text)
+
+    # Run perplexity
+    ppl = get_perplexity(text)
+
+    # Decision rule: combine both
+    # Low perplexity (<50) + high classifier score (>0.7) = AI
+    if clf_score > 0.7 and ppl < 50:
+        final = "AI"
+    elif clf_score < 0.3 and ppl > 80:
+        final = "Human"
     else:
- … (old lines 151–157: 7 removed lines not shown in the extracted diff)
-        labels.append(label)
-        scores.append(score)
-        logging.debug(f"Model {det_idx}: Label={label}, Score={score:.4f}")
-        label_counts = Counter(labels)
-        final_label = label_counts.most_common(1)[0][0]
-        if final_label == "Uncertain" or len(set(labels)) == len(detector) or any(l == "Human" for l in labels):
-            final_label = "Human" if burstiness > 1.5 else "Uncertain"
-        avg_score = sum(scores) / len(scores) if scores else 0.0
-        note = f"{note_lang}. Ensemble used: {len(detector)} models. {note_burst}."
-        if 0.55 <= avg_score < 0.98:
-            note += " Warning: Close to threshold, result may be uncertain."
-        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f} | Model Scores: {scores}")
-    else:
-        result = detector(cleaned_text, truncation=True, max_length=512)[0]
-        score = result['score']
-        final_label = "AI" if score >= 0.95 else "Human" if score < 0.60 else "Uncertain"
-        if final_label == "Uncertain" or final_label == "Human":
-            final_label = "Human" if burstiness > 1.0 else "Uncertain"
-        avg_score = score
-        note = f"{note_lang}. {note_burst}."
-        if 0.60 <= score < 0.95:
-            note += " Warning: Close to threshold, result may be uncertain."
-        logging.info(f"Language: {detected_lang} | Text Length: {len(cleaned_text)} | Prediction: {final_label} | Score: {avg_score:.4f} | Burstiness: {burstiness:.2f}")
-        return {"prediction": final_label, "score": avg_score, "note": note}
+        final = "Uncertain"
+
+    return {
+        "classifier_score": round(clf_score, 4),
+        "perplexity": round(ppl, 2),
+        "final_label": final
+    }
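To exercise the new endpoint once the server is running, a minimal request sketch; the route, request field, and response keys come from the app.py diff above, while the sample text and the numbers in the response are purely illustrative:

```bash
# POST a text sample to the /detect route defined in app.py
curl -X POST http://localhost:7860/detect \
  -H "Content-Type: application/json" \
  -d '{"text": "The mitochondria is the powerhouse of the cell."}'

# Response shape (values illustrative):
# {"classifier_score": 0.9123, "perplexity": 42.57, "final_label": "AI"}
```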
requirements.txt CHANGED
@@ -3,4 +3,5 @@ torch==2.4.1
 fastapi==0.115.2
 uvicorn==0.32.0
 pydantic==2.9.2
-numpy==2.0.2
+numpy==2.0.2
+scipy==1.14.1