Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- Dockerfile +38 -0
- main.py +364 -0
- requirements.txt +10 -0
Dockerfile
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use Python 3.11 slim image
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p /app/templates /app/static /tmp

# Set environment variables
ENV PYTHONPATH="/app"
ENV PYTHONUNBUFFERED=1

# Expose port
EXPOSE 7860

# Health check
# Bug fix: the app listens on 7860 (see EXPOSE and CMD below), but the
# health check previously probed port 8000 and therefore always failed,
# marking the container unhealthy.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
main.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
import json
import logging
import os
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional

import uvicorn
from fastapi import BackgroundTasks, FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field

# Import your scrapers
from app3 import PhoneDBScraper, GSMArenaScraperAlternative
|
16 |
+
|
17 |
+
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="Phone Specifications API",
    description="API for scraping phone specifications from PhoneDB and GSMArena",
    version="1.0.0"
)

# Add CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers for credentialed requests and is unsafe for public
# deployments — lock allow_origins down before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify allowed origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files for frontend
# Guarded so a checkout without a static/ directory still boots.
if os.path.exists("static"):
    app.mount("/static", StaticFiles(directory="static"), name="static")
|
40 |
+
|
41 |
+
# Pydantic models
|
42 |
+
class PhoneSearchRequest(BaseModel):
    """Request body for a single-phone search (/api/search)."""
    phone_name: str
    source: str = "gsmarena"  # "phonedb" or "gsmarena"

class MultiplePhoneSearchRequest(BaseModel):
    """Request body for batch searches (/api/search/multiple and
    /api/scrape/background)."""
    phone_names: List[str]
    source: str = "gsmarena"

class PhoneSpecification(BaseModel):
    # NOTE(review): declared but not referenced by any endpoint in this
    # file — presumably documents the dict shape the scrapers return;
    # confirm against app3 before relying on it.
    """A scraped phone record."""
    name: str
    brand: str
    images: List[str]
    specifications: Dict[str, Any]
    source_url: str
|
56 |
+
|
57 |
+
class ApiResponse(BaseModel):
    """Standard response envelope returned by every endpoint."""
    success: bool
    message: str
    data: Optional[Any] = None
    # Bug fix: `= datetime.now().isoformat()` evaluated ONCE at class
    # definition (import) time, so every response carried the same stale
    # timestamp. default_factory generates it per instance instead.
    timestamp: str = Field(default_factory=lambda: datetime.now().isoformat())
|
62 |
+
|
63 |
+
# Global scrapers: module-level singletons, None until startup populates them.
# Endpoints check for None and answer 503 when initialization failed.
phonedb_scraper = None
gsmarena_scraper = None

# NOTE(review): @app.on_event is deprecated in newer FastAPI in favour of
# lifespan handlers — consider migrating when upgrading FastAPI.
@app.on_event("startup")
async def startup_event():
    """Initialize scrapers on startup"""
    global phonedb_scraper, gsmarena_scraper
    try:
        phonedb_scraper = PhoneDBScraper()
        gsmarena_scraper = GSMArenaScraperAlternative()
        logger.info("Scrapers initialized successfully")
    except Exception as e:
        # Deliberately swallowed: the app still boots and endpoints report
        # 503 for the missing scraper rather than crashing the process.
        logger.error(f"Error initializing scrapers: {e}")
|
77 |
+
|
78 |
+
# Routes
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """Serve templates/index.html, or a minimal landing page when the
    template file is missing."""
    try:
        with open("templates/index.html", "r", encoding="utf-8") as f:
            page = f.read()
    except FileNotFoundError:
        # No template shipped — point the visitor at the API docs instead.
        page = """
        <html>
        <head><title>Phone Specs API</title></head>
        <body>
            <h1>Phone Specifications API</h1>
            <p>API is running! Visit <a href="/docs">/docs</a> for API documentation.</p>
        </body>
        </html>
        """
    return HTMLResponse(content=page)
|
95 |
+
|
96 |
+
@app.get("/health")
async def health_check():
    """Report service liveness and which scrapers were initialized."""
    scraper_status = {
        "phonedb": phonedb_scraper is not None,
        "gsmarena": gsmarena_scraper is not None,
    }
    return ApiResponse(
        success=True,
        message="API is healthy",
        data={"status": "running", "scrapers": scraper_status},
    )
|
104 |
+
|
105 |
+
@app.post("/api/search", response_model=ApiResponse)
async def search_phone(request: PhoneSearchRequest):
    """Search for a single phone.

    Uses the scraper named in ``request.source`` ("phonedb" or "gsmarena"),
    falling back to whichever scraper is available. Returns the scraped
    record in ``data``, or ``success=False`` when nothing matched.

    Raises:
        HTTPException: 503 when no scraper is available, 500 on scraper errors.
    """
    try:
        logger.info(f"Searching for phone: {request.phone_name} using {request.source}")

        # Choose scraper based on source, falling back to whatever exists.
        source = request.source.lower()
        if source == "phonedb" and phonedb_scraper:
            scraper = phonedb_scraper
        elif source == "gsmarena" and gsmarena_scraper:
            scraper = gsmarena_scraper
        elif gsmarena_scraper:
            scraper = gsmarena_scraper
        elif phonedb_scraper:
            scraper = phonedb_scraper
        else:
            raise HTTPException(status_code=503, detail="No scrapers available")

        # Scrapers are blocking; run in the default thread pool so the
        # event loop stays responsive. (get_running_loop replaces the
        # deprecated get_event_loop inside coroutines.)
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            scraper.scrape_phone_by_name,
            request.phone_name,
        )

        if result:
            return ApiResponse(
                success=True,
                message=f"Successfully found specifications for {result['name']}",
                data=result,
            )
        return ApiResponse(
            success=False,
            message=f"No results found for {request.phone_name}",
            data=None,
        )

    except HTTPException:
        # Bug fix: the generic handler below previously remapped the
        # deliberate 503 to a 500 — re-raise HTTP errors untouched.
        raise
    except Exception as e:
        logger.error(f"Error searching for phone {request.phone_name}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
149 |
+
|
150 |
+
@app.post("/api/search/multiple", response_model=ApiResponse)
async def search_multiple_phones(request: MultiplePhoneSearchRequest):
    """Search for multiple phones in one request.

    Applies the same scraper-selection rules as /api/search and reports
    how many of the requested phones were scraped successfully.

    Raises:
        HTTPException: 503 when no scraper is available, 500 on scraper errors.
    """
    try:
        logger.info(f"Searching for {len(request.phone_names)} phones using {request.source}")

        # Choose scraper, falling back to whatever is available.
        source = request.source.lower()
        if source == "phonedb" and phonedb_scraper:
            scraper = phonedb_scraper
        elif source == "gsmarena" and gsmarena_scraper:
            scraper = gsmarena_scraper
        elif gsmarena_scraper:
            scraper = gsmarena_scraper
        elif phonedb_scraper:
            scraper = phonedb_scraper
        else:
            raise HTTPException(status_code=503, detail="No scrapers available")

        # Blocking batch scrape -> default thread pool (modernized from the
        # deprecated get_event_loop).
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(
            None,
            scraper.scrape_multiple_phones,
            request.phone_names,
        )

        success_count = len(results) if results else 0
        total_count = len(request.phone_names)

        return ApiResponse(
            success=success_count > 0,
            message=f"Successfully scraped {success_count}/{total_count} phones",
            data={
                "phones": results,
                "success_count": success_count,
                "total_count": total_count,
            },
        )

    except HTTPException:
        # Bug fix: re-raise deliberate HTTP errors (503) instead of letting
        # the generic handler below remap them to 500.
        raise
    except Exception as e:
        logger.error(f"Error searching for multiple phones: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
193 |
+
|
194 |
+
@app.get("/api/sources")
async def get_available_sources():
    """List the scraping sources that were successfully initialized."""
    catalog = [
        (phonedb_scraper, "phonedb", "PhoneDB", "PhoneDB.net database"),
        (gsmarena_scraper, "gsmarena", "GSMArena", "GSMArena.com database"),
    ]
    sources = [
        {"id": sid, "name": name, "description": desc, "available": True}
        for scraper, sid, name, desc in catalog
        if scraper
    ]
    return ApiResponse(
        success=True,
        message="Available sources retrieved",
        data=sources,
    )
|
220 |
+
|
221 |
+
@app.get("/api/export/{phone_name}")
async def export_phone_data(phone_name: str, source: str = "gsmarena"):
    """Export phone data as a downloadable JSON file.

    Scrapes the phone, writes the result to a JSON file under /tmp, and
    streams it back as an attachment.

    Raises:
        HTTPException: 503 when the scraper is unavailable, 404 when the
        phone is not found, 500 on other errors.
    """
    try:
        # Choose scraper: explicit "phonedb" when available, else GSMArena.
        if source.lower() == "phonedb" and phonedb_scraper:
            scraper = phonedb_scraper
        else:
            scraper = gsmarena_scraper

        if not scraper:
            raise HTTPException(status_code=503, detail="Scraper not available")

        # Blocking scraper call -> default thread pool.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            scraper.scrape_phone_by_name,
            phone_name,
        )

        if not result:
            raise HTTPException(status_code=404, detail="Phone not found")

        # Security: phone_name comes straight from the URL path — strip
        # anything that could escape /tmp (slashes, "..", etc.) before
        # building a filesystem path from it.
        safe_name = "".join(
            c for c in phone_name if c.isalnum() or c in (" ", "-", "_")
        ).strip().replace(" ", "_") or "phone"
        filename = f"{safe_name}_specs.json"
        # Bug fix: previously the literal path "/tmp/(unknown)" was used for
        # every export; derive the path from the sanitized filename instead.
        filepath = os.path.join("/tmp", filename)

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return FileResponse(
            filepath,
            media_type="application/json",
            filename=filename,
        )

    except HTTPException:
        # Preserve the deliberate 404/503 instead of masking them as 500.
        raise
    except Exception as e:
        logger.error(f"Error exporting phone data: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
261 |
+
|
262 |
+
# Registry of background scraping jobs, keyed by job id.
# NOTE(review): entries are never evicted, so this grows without bound in a
# long-running process — consider a TTL or max size.
background_jobs = {}

@app.post("/api/scrape/background")
async def start_background_scraping(request: MultiplePhoneSearchRequest, background_tasks: BackgroundTasks):
    """Start a background scraping job for multiple phones.

    Registers an initial job record and schedules run_background_scraping
    as a FastAPI background task; poll /api/scrape/status/{job_id} for
    progress.
    """
    # Bug fix: a second-resolution timestamp alone collides when two jobs
    # start within the same second, silently clobbering the first job's
    # record — append a random suffix to make the id unique.
    job_id = f"job_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"

    # Initialize job status so the status endpoint works immediately.
    background_jobs[job_id] = {
        "status": "started",
        "progress": 0,
        "total": len(request.phone_names),
        "results": [],
        "started_at": datetime.now().isoformat()
    }

    # Schedule the worker after the response is sent.
    background_tasks.add_task(
        run_background_scraping,
        job_id,
        request.phone_names,
        request.source
    )

    return ApiResponse(
        success=True,
        message="Background scraping job started",
        data={"job_id": job_id}
    )
|
292 |
+
|
293 |
+
@app.get("/api/scrape/status/{job_id}")
async def get_scraping_status(job_id: str):
    """Return the progress record of a background scraping job."""
    job = background_jobs.get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return ApiResponse(
        success=True,
        message="Job status retrieved",
        data=job,
    )
|
304 |
+
|
305 |
+
async def run_background_scraping(job_id: str, phone_names: List[str], source: str):
    """Worker coroutine for a background scraping job.

    Mutates the shared ``background_jobs[job_id]`` record as it goes:
    status (running/completed/failed), progress counter, current phone,
    and the accumulated results.
    """
    try:
        # Choose scraper: explicit "phonedb" when available, else GSMArena.
        if source.lower() == "phonedb" and phonedb_scraper:
            scraper = phonedb_scraper
        else:
            scraper = gsmarena_scraper

        if not scraper:
            background_jobs[job_id]["status"] = "failed"
            background_jobs[job_id]["error"] = "Scraper not available"
            return

        background_jobs[job_id]["status"] = "running"
        results = []
        # Hoisted out of the loop (it was fetched every iteration) and
        # modernized from the deprecated get_event_loop().
        loop = asyncio.get_running_loop()

        for i, phone_name in enumerate(phone_names):
            try:
                # Update progress so the status endpoint reflects reality.
                background_jobs[job_id]["progress"] = i
                background_jobs[job_id]["current_phone"] = phone_name

                # Blocking scraper call -> default thread pool.
                result = await loop.run_in_executor(
                    None,
                    scraper.scrape_phone_by_name,
                    phone_name,
                )

                if result:
                    results.append(result)

                # Be polite to the scraped site between requests.
                await asyncio.sleep(2)

            except Exception as e:
                # One bad phone must not kill the whole job.
                logger.error(f"Error scraping {phone_name} in background job: {e}")
                continue

        # Finalize job record.
        background_jobs[job_id]["status"] = "completed"
        background_jobs[job_id]["progress"] = len(phone_names)
        background_jobs[job_id]["results"] = results
        background_jobs[job_id]["completed_at"] = datetime.now().isoformat()

    except Exception as e:
        background_jobs[job_id]["status"] = "failed"
        background_jobs[job_id]["error"] = str(e)
        logger.error(f"Background job {job_id} failed: {e}")
|
356 |
+
|
357 |
+
if __name__ == "__main__":
    # Dev entry point only: the Docker image starts uvicorn via CMD, so
    # reload=True here never applies inside the container.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        reload=True,
        log_level="info"
    )
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.104.1
|
2 |
+
uvicorn[standard]==0.24.0
|
3 |
+
requests==2.31.0
|
4 |
+
beautifulsoup4==4.12.2
|
5 |
+
lxml==4.9.3
|
6 |
+
urllib3==2.0.7
|
7 |
+
pydantic==2.5.0
|
8 |
+
python-multipart==0.0.6
|
9 |
+
jinja2==3.1.2
|
10 |
+
aiofiles==23.2.1
|