Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,117 +2,202 @@ import gradio as gr
|
|
2 |
import requests
|
3 |
import pytesseract
|
4 |
from PIL import Image
|
5 |
-
import os
|
6 |
import docx
|
7 |
from transformers import pipeline
|
8 |
from keybert import KeyBERT
|
9 |
from io import BytesIO
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
emotion_classifier = pipeline("text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student")
|
17 |
keyword_extractor = KeyBERT()
|
18 |
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
|
19 |
|
20 |
-
|
21 |
-
def
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
{
|
32 |
-
"caption": "
|
33 |
-
"image_url": "https://example.com/
|
34 |
"date": "2023-10-01",
|
35 |
"likes": 100,
|
36 |
"comments": 20,
|
37 |
-
}
|
38 |
-
{
|
39 |
-
"caption": "Another post about technology and innovation in 2023.",
|
40 |
-
"image_url": "",
|
41 |
-
"date": "2023-10-02",
|
42 |
-
"likes": 50,
|
43 |
-
"comments": 10,
|
44 |
-
},
|
45 |
]
|
46 |
-
return posts[:num_posts]
|
47 |
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
def categorize_post(caption):
|
50 |
categories = ["activism", "politics", "social issues", "technology", "environment", "health"]
|
51 |
result = zero_shot_classifier(caption, candidate_labels=categories)
|
52 |
-
return result["labels"][0]
|
53 |
|
54 |
-
# Function to analyze detailed sentiment using RoBERTa-based emotion classifier
|
55 |
def analyze_sentiment(caption):
|
56 |
emotions = emotion_classifier(caption, top_k=None)
|
57 |
-
|
58 |
-
return top_emotions
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
64 |
|
65 |
-
|
66 |
-
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
for i, post in enumerate(posts):
|
69 |
doc.add_heading(f"Post {i+1}", level=1)
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
#
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
#
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
104 |
|
105 |
-
# Gradio UI
|
106 |
iface = gr.Interface(
|
107 |
-
fn=
|
108 |
inputs=[
|
109 |
-
gr.Textbox(label="
|
110 |
-
gr.Textbox(label="Hashtags
|
111 |
-
gr.
|
112 |
],
|
113 |
-
outputs=gr.File(label="Download
|
114 |
-
title="Social Media
|
115 |
-
description="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
)
|
117 |
|
118 |
-
|
|
|
|
# Standard library
import asyncio
import re
import time
from io import BytesIO

# Third-party
import docx
import pytesseract
import requests
from bs4 import BeautifulSoup
from keybert import KeyBERT
from langdetect import detect
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from transformers import pipeline
from twscrape import API, gather
from webdriver_manager.chrome import ChromeDriverManager
19 |
+
# Set up Tesseract
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Uncomment for Windows

# Initialize AI models.
# NOTE: all three pipelines download and load at import time, which is slow
# and memory-heavy; they are module-level singletons shared by the analysis
# functions below.
emotion_classifier = pipeline("text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student")
# NOTE(review): keyword_extractor is initialized but not referenced in the
# visible code — confirm whether it is still needed.
keyword_extractor = KeyBERT()
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
27 |
+
class RealTimeSocialScraper:
    """Scrapes live posts from Twitter, Instagram, or TikTok.

    Twitter uses the twscrape account pool; Instagram uses a headless
    Chrome driver plus BeautifulSoup; TikTok is a stub returning demo data.
    """

    def __init__(self):
        self.api = API()  # twscrape account pool; configure proxies if needed
        self.driver = self._init_browser()

    def _init_browser(self):
        """Start a headless Chrome instance for HTML-based scraping."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # BUG FIX: Selenium 4 removed the positional executable-path
        # argument; the driver path must be wrapped in a Service object.
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

    async def scrape(self, platform, query, limit=10):
        """Dispatch to the platform-specific scraper.

        Raises:
            ValueError: if platform is not twitter/instagram/tiktok.
        """
        if platform == "twitter":
            return await self._scrape_twitter(query, limit)
        elif platform == "instagram":
            return self._scrape_instagram(query)
        elif platform == "tiktok":
            return self._scrape_tiktok(query)
        else:
            raise ValueError(f"Unsupported platform: {platform}")

    async def _scrape_twitter(self, query, limit):
        # Log in every pooled account before searching.
        await self.api.pool.login_all()
        return await gather(self.api.search(query, limit=limit))

    def _scrape_instagram(self, query):
        """Scrape up to 10 posts from an Instagram hashtag explore page.

        NOTE(review): "v1Nh3" is an obfuscated, auto-generated Instagram
        CSS class and is very likely stale — verify against the live page.
        """
        self.driver.get(f"https://www.instagram.com/explore/tags/{query}/")
        # BUG FIX: WebDriverWait / EC / By were used here without being
        # imported anywhere, so this method always raised NameError.
        WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "v1Nh3"))
        )
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        posts = []
        for post in soup.findAll("div", class_="v1Nh3"):
            img = post.find('img')
            if img is None:
                # Skip tiles without an image rather than crash on
                # NoneType subscripting.
                continue
            posts.append({
                'content': img.get('alt', ''),
                'image_url': img.get('src', '')
            })
        return posts[:10]

    def _scrape_tiktok(self, query):
        # Implement TikTok scraping logic or use API
        return [{"content": f"Demo TikTok post about {query}"}]
70 |
+
async def extract_posts(profile_url, hashtags, num_posts):
    """Scrape up to num_posts posts for the first hashtag.

    Args:
        profile_url: used only to guess the platform (twitter vs instagram).
        hashtags: list of hashtag strings; only the first is searched.
        num_posts: maximum number of posts to return.

    Returns:
        A list of normalized post dicts; canned sample data when the
        hashtag list is empty or scraping fails.
    """
    scraper = RealTimeSocialScraper()
    platform = "twitter" if "twitter" in profile_url else "instagram"

    # Guard: hashtags[0] below would raise IndexError on an empty list.
    if not hashtags:
        return _fallback_data(num_posts)

    try:
        raw_posts = await scraper.scrape(platform, hashtags[0], num_posts)
        # BUG FIX: _format_posts is a plain (non-async) function returning
        # a list; awaiting it raised "object list can't be used in 'await'
        # expression", which silently forced the fallback path every time.
        return _format_posts(raw_posts, platform)
    except Exception as e:
        print(f"Scraping failed: {e}")
        return _fallback_data(num_posts)
81 |
+
def _format_posts(raw_posts, platform):
    """Normalize raw scraped posts into the report's dict schema.

    Handles both dict posts (Instagram/TikTok scrapers) and attribute-style
    posts (twscrape tweet objects with rawContent/likeCount/replyCount).

    Args:
        raw_posts: iterable of dicts or tweet-like objects.
        platform: currently unused; kept for interface compatibility.
    """
    def _field(post, attr, key, default):
        # Dicts are read by key; other objects by attribute.
        # BUG FIX: the old code passed post.get(...) as getattr's default,
        # which (a) raised AttributeError for tweet objects — no .get
        # method, and the default expression is evaluated eagerly — and
        # (b) dropped dict values, since getattr never sees dict keys.
        if isinstance(post, dict):
            return post.get(key, default)
        return getattr(post, attr, default)

    formatted = []
    for post in raw_posts:
        base_post = {
            "caption": _field(post, "rawContent", "content", "No caption"),
            "image_url": _field(post, "image_url", "image_url", ""),
            "video_url": "",
            "audio_url": "",
            "tagged_audience": [],
            # Scrape date, not post date — the scrapers expose no timestamp.
            "date": str(time.strftime("%Y-%m-%d")),
            "likes": _field(post, "likeCount", "likes", 0),
            "comments": _field(post, "replyCount", "comments", 0)
        }
        formatted.append(base_post)
    return formatted
97 |
+
def _fallback_data(num_posts):
    """Return num_posts canned sample posts (used when scraping fails).

    Carries the same keys as _format_posts output so downstream report
    code can index every field without KeyError — the old version lacked
    "video_url", which process_posts reads.
    """
    return [
        {
            "caption": "Sample post about environmental issues",
            "image_url": "https://example.com/sample.jpg",
            "video_url": "",
            "audio_url": "",
            "tagged_audience": [],
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        } for _ in range(num_posts)
    ]
|
|
|
107 |
|
108 |
+
def extract_text_from_image(image_url):
    """Download an image and run OCR on it.

    Returns the stripped extracted text, or an "OCR Error: ..." string on
    any failure (best-effort by design — callers embed the result as-is).
    """
    try:
        response = requests.get(image_url, timeout=10)
        picture = Image.open(BytesIO(response.content))
        extracted = pytesseract.image_to_string(picture)
    except Exception as e:
        return f"OCR Error: {str(e)}"
    return extracted.strip()
116 |
+
|
117 |
def categorize_post(caption):
    """Return the best-matching topic label for a caption via zero-shot NLI."""
    candidate_topics = [
        "activism", "politics", "social issues",
        "technology", "environment", "health",
    ]
    scored = zero_shot_classifier(caption, candidate_labels=candidate_topics)
    return scored["labels"][0]
121 |
|
|
|
122 |
def analyze_sentiment(caption):
    """Return the three highest-scoring emotion predictions for a caption."""
    scores = emotion_classifier(caption, top_k=None)
    ranked = sorted(scores, key=lambda item: item["score"], reverse=True)
    return ranked[:3]
125 |
|
126 |
+
def detect_language(caption):
    """Best-effort ISO language code for a caption; "Unknown" on failure.

    langdetect raises on empty or undecidable text; the narrow
    `except Exception` replaces a bare `except:` that also swallowed
    KeyboardInterrupt and SystemExit.
    """
    try:
        return detect(caption)
    except Exception:
        return "Unknown"
131 |
|
132 |
+
# Compiled once at import time; reused for every caption.
_HASHTAG_PATTERN = re.compile(r"#\w+")


def extract_hashtags(caption):
    """Return every #hashtag token in the caption, in order of appearance."""
    return _HASHTAG_PATTERN.findall(caption)
134 |
|
135 |
+
def process_posts(profile_url, hashtags, num_posts):
    """Scrape posts, analyze each one, and write a .docx report.

    Args:
        profile_url: social media profile URL (used to guess the platform).
        hashtags: comma-separated hashtag string from the UI.
        num_posts: number of posts to analyze.

    Returns:
        Path of the saved report, for Gradio's File output.
    """
    tag_list = [h.strip() for h in hashtags.split(",")]
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        posts = loop.run_until_complete(extract_posts(profile_url, tag_list, num_posts))
    finally:
        loop.close()  # BUG FIX: the loop was never closed (resource leak)

    doc = docx.Document()
    doc.add_heading("Social Media Analysis Report", 0)

    for i, post in enumerate(posts):
        doc.add_heading(f"Post {i+1}", level=1)

        # Metadata Section — .get() guards keys the fallback data may lack
        # (the old code raised KeyError on post['video_url']).
        image_url = post.get('image_url', '')
        video_url = post.get('video_url', '')
        meta = [
            f"Date: {post.get('date', 'N/A')}",
            f"Likes: {post.get('likes', 0)}",
            f"Comments: {post.get('comments', 0)}",
            f"Media: Pictures={1 if image_url else 0}, Videos={1 if video_url else 0}"
        ]
        doc.add_paragraph("\n".join(meta))

        # Content Analysis
        caption = post.get('caption', '')
        content = doc.add_paragraph()
        content.add_run("Caption Analysis:\n").bold = True
        content.add_run(f"{caption}\n\n")

        # Sentiment and Language
        content.add_run(f"Language: {detect_language(caption)}\n")
        emotions = analyze_sentiment(caption)
        # BUG FIX: nesting single quotes inside a single-quoted f-string
        # is a SyntaxError before Python 3.12; build the summary first.
        sentiment = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        content.add_run(f"Sentiment: {sentiment}\n")

        # Hashtags and Category — local renamed so it no longer shadows
        # the `hashtags` parameter.
        post_tags = extract_hashtags(caption)
        content.add_run(f"Hashtags: {', '.join(post_tags) if post_tags else 'None'}\n")
        content.add_run(f"Category: {categorize_post(caption)}\n")

        # Image Analysis (OCR text truncated to 500 chars)
        if image_url:
            img_analysis = doc.add_paragraph()
            img_analysis.add_run("Image Analysis:\n").bold = True
            img_analysis.add_run(f"Extracted Text: {extract_text_from_image(image_url)[:500]}\n")

        doc.add_page_break()

    report_path = "social_media_analysis.docx"
    doc.save(report_path)
    return report_path
181 |
|
|
|
182 |
# Gradio UI wiring: the three inputs map positionally to process_posts'
# (profile_url, hashtags, num_posts) parameters; the File output receives
# the saved .docx report path.
iface = gr.Interface(
    fn=process_posts,
    inputs=[
        gr.Textbox(label="Profile URL", placeholder="Enter social media profile URL"),
        gr.Textbox(label="Hashtags", placeholder="Comma-separated hashtags"),
        gr.Slider(1, 50, value=5, label="Posts to Analyze")
    ],
    outputs=gr.File(label="Download Report"),
    title="Social Media Intelligence Analyzer",
    description="""Real-time social media analysis with:
    - 🕵️♂️ Live scraping
    - 📊 Sentiment analysis
    - 🖼️ Image OCR
    - 🏷️ Hashtag tracking""",
    # Example rows are clickable presets; clicking one runs a real scrape.
    examples=[
        ["https://twitter.com/eco_news", "climate, environment", 3],
        ["https://instagram.com/tech_innovators", "technology, future", 2]
    ]
)

# Launch only when run as a script (Spaces also calls launch this way).
if __name__ == "__main__":
    iface.launch()