Spaces:
Runtime error
Create app.py
app.py
ADDED
@@ -0,0 +1,118 @@
import gradio as gr
import requests
import pytesseract
from PIL import Image
import docx
from transformers import pipeline
from keybert import KeyBERT
from io import BytesIO

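# Note on dependencies (an assumption based on the imports above, not part of the
# original commit): on a Hugging Face Space this app needs a requirements.txt listing
# roughly gradio, requests, pytesseract, Pillow, python-docx, transformers, torch and
# keybert, plus a packages.txt containing tesseract-ocr so the Tesseract binary exists
# in the Linux container. A missing system binary or Python package is a common cause
# of the "Runtime error" status this Space shows.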
# Set up Tesseract (ensure the Tesseract binary is installed on your system)
# For Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Set up AI models:
# a GoEmotions emotion classifier (DistilBERT student model), a KeyBERT keyword
# extractor, and a BART-large-MNLI zero-shot classifier for frames/categories.
emotion_classifier = pipeline("text-classification", model="joeddav/distilbert-base-uncased-go-emotions-student")
keyword_extractor = KeyBERT()
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

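# All three models are loaded once at startup; the first launch downloads them from the
# Hugging Face Hub (bart-large-mnli alone is on the order of 1.6 GB), so expect a slow
# cold start and make sure the Space has enough disk and memory for the downloads.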
# Function to extract text from an image using Tesseract OCR
def extract_text_from_image(image_url):
    try:
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        # A placeholder URL or a non-image response should not crash the whole run
        return f"[OCR failed: {e}]"

# Function to extract posts from social media (placeholder for actual scraping logic)
def extract_posts(profile_url, hashtags, num_posts):
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    return posts[: int(num_posts)]  # int() guards against a float arriving from the UI

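# A minimal way to feed real data into the pipeline without writing a scraper (a sketch,
# not part of the original app): load posts from a JSON export that follows the same
# dict schema as the placeholder above. The file name in the usage comment is hypothetical.
def load_posts_from_json(json_path, hashtags, num_posts):
    import json  # local import to keep the sketch self-contained

    # Each entry is expected to provide caption / image_url / date / likes / comments
    with open(json_path, "r", encoding="utf-8") as f:
        posts = json.load(f)
    # Keep only posts whose caption mentions at least one of the requested hashtags
    if hashtags:
        posts = [p for p in posts if any(tag.lower() in p["caption"].lower() for tag in hashtags)]
    return posts[: int(num_posts)]

# Usage (hypothetical file): posts = load_posts_from_json("posts.json", ["#climate"], 5)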
# Function to categorize the post (frame extraction) using zero-shot classification
def categorize_post(caption):
    categories = ["activism", "politics", "social issues", "technology", "environment", "health"]
    result = zero_shot_classifier(caption, candidate_labels=categories)
    return result["labels"][0]  # Return the most likely category

# Function to analyze detailed sentiment using the GoEmotions emotion classifier
def analyze_sentiment(caption):
    emotions = emotion_classifier(caption, top_k=None)  # scores for every emotion label
    top_emotions = sorted(emotions, key=lambda x: x["score"], reverse=True)[:3]  # Top 3 emotions
    return top_emotions

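# Illustrative return shapes (the values below are made up; only the structure matters):
#   categorize_post("Join the climate march!")   -> "activism"
#   analyze_sentiment("Join the climate march!") ->
#       [{"label": "optimism", "score": 0.41},
#        {"label": "excitement", "score": 0.22},
#        {"label": "caring", "score": 0.12}]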
# Function to process posts and generate a Word document
def process_posts(profile_url, hashtags, num_posts):
    hashtags = [tag.strip() for tag in hashtags.split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for i, post in enumerate(posts):
        doc.add_heading(f"Post {i+1}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {post['caption']}")

        # Extract text from the image using Tesseract
        if post["image_url"]:
            extracted_text = extract_text_from_image(post["image_url"])
            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the GoEmotions emotion classifier
        emotions = analyze_sentiment(post["caption"])
        emotion_text = ", ".join([f"{e['label']} ({e['score']:.2f})" for e in emotions])
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword extraction
        keywords = keyword_extractor.extract_keywords(post["caption"], keyphrase_ngram_range=(1, 2), stop_words="english")
        doc.add_paragraph(f"Extracted Keywords: {', '.join([kw[0] for kw in keywords])}")

        # Frame extraction (context/category)
        category = categorize_post(post["caption"])
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path

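# Note: "extracted_posts.docx" is a fixed path, so concurrent users of the Space would
# overwrite each other's output. A sketch of one way around that (not in the original),
# using only the standard library:
#
#   import tempfile
#   def _unique_doc_path():
#       tmp = tempfile.NamedTemporaryFile(suffix=".docx", delete=False)
#       tmp.close()
#       return tmp.name  # pass this to doc.save(...) instead of the fixed name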
# Gradio handler
def gradio_app(profile_url, hashtags, num_posts):
    try:
        doc_path = process_posts(profile_url, hashtags, num_posts)
        return doc_path
    except Exception as e:
        # Returning a plain string to a gr.File output would itself fail,
        # so surface the problem in the UI instead
        raise gr.Error(f"Error: {e}")

# Gradio UI
iface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(label="Social Media Profile URL"),
        gr.Textbox(label="Hashtags (comma-separated)"),
        gr.Number(label="Number of Posts to Extract", precision=0),
    ],
    outputs=gr.File(label="Download Extracted Posts"),
    title="Social Media Post Extractor",
    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed emotion analysis (GoEmotions), keyword extraction (KeyBERT), and frame (context/category) extraction (zero-shot classification).",
)

iface.launch()