ahm14 committed on
Commit
103d063
·
verified ·
1 Parent(s): f24a1ab

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import pytesseract
4
+ from PIL import Image
5
+ import os
6
+ import docx
7
+ from transformers import pipeline
8
+ from keybert import KeyBERT
9
+ from io import BytesIO
10
+
11
+ # Set up Tesseract (ensure Tesseract is installed on your system)
12
+ # For Windows, specify the Tesseract path if needed:
13
+ # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
14
+
15
+ # Set up AI models
16
# Hugging Face model identifiers for the two transformers pipelines.
EMOTION_MODEL_NAME = "joeddav/distilbert-base-uncased-go-emotions"
ZERO_SHOT_MODEL_NAME = "facebook/bart-large-mnli"

# Emotion classifier consumed by analyze_sentiment().
emotion_classifier = pipeline(task="text-classification", model=EMOTION_MODEL_NAME)
# Keyword extractor built with KeyBERT's default settings.
keyword_extractor = KeyBERT()
# Zero-shot classifier consumed by categorize_post().
zero_shot_classifier = pipeline(task="zero-shot-classification", model=ZERO_SHOT_MODEL_NAME)
19
+
20
+ # Function to extract text from image using Tesseract
21
def extract_text_from_image(image_url):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: URL of the image to fetch.

    Returns:
        The output of ``pytesseract.image_to_string`` for the downloaded
        image, with leading and trailing whitespace stripped.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
        OSError: If the downloaded bytes cannot be opened as an image.
    """
    # Without an explicit timeout a stalled server would block this call
    # (and the UI request that triggered it) indefinitely.
    response = requests.get(image_url, timeout=30)
    # Stop on 4xx/5xx here rather than handing an error page to the decoder.
    response.raise_for_status()
    # The context manager releases the image's resources once OCR is done.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()
26
+
27
+ # Function to extract posts from social media (placeholder for actual scraping logic)
28
def extract_posts(profile_url, hashtags, num_posts):
    """Return up to ``num_posts`` sample posts (placeholder for real scraping).

    ``profile_url`` and ``hashtags`` are accepted for interface compatibility
    but are not used yet: the function always draws from the same hard-coded
    sample posts.

    Args:
        profile_url: Profile URL to scrape (currently unused).
        hashtags: Hashtags to filter by (currently unused).
        num_posts: Maximum number of posts to return. ``None`` returns every
            available post; negative values are treated as zero; float values
            are truncated to an integer.

    Returns:
        A list of post dicts with the keys ``caption``, ``image_url``,
        ``date``, ``likes`` and ``comments``.
    """
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # A raw slice would drop posts from the end for a negative count and
    # raise TypeError for a float count, so coerce and clamp first.
    limit = max(int(num_posts), 0)
    return posts[:limit]
47
+
48
+ # Function to categorize post using Zero-Shot Classification
49
def categorize_post(caption):
    """Return the best-matching category label for a caption.

    The caption is scored against a fixed list of candidate labels by the
    module-level ``zero_shot_classifier``, and the first entry of the
    returned ``"labels"`` list (the most likely category) is handed back.

    Args:
        caption: Post text to classify.

    Returns:
        One of the candidate label strings.
    """
    candidate_labels = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    prediction = zero_shot_classifier(caption, candidate_labels=candidate_labels)
    ranked_labels = prediction["labels"]
    return ranked_labels[0]
53
+
54
+ # Function to analyze detailed sentiment using the DistilBERT-based GoEmotions emotion classifier
55
def analyze_sentiment(caption):
    """Return the three highest-scoring emotions for a caption.

    The module-level ``emotion_classifier`` is called with ``top_k=None``,
    and its results are ordered by descending ``"score"``.

    Args:
        caption: Post text to analyse.

    Returns:
        A list of at most three classifier result entries, highest score
        first.
    """
    def _score_of(entry):
        return entry["score"]

    ranked = list(emotion_classifier(caption, top_k=None))
    ranked.sort(key=_score_of, reverse=True)
    return ranked[:3]
59
+
60
+ # Function to process posts and generate Word document
61
def process_posts(profile_url, hashtags, num_posts):
    """Analyse the extracted posts and write the results to a Word document.

    For every post the report contains the date, like and comment counts,
    the caption, any text recognised in the post's image, the top emotions,
    extracted keywords, and the zero-shot category.

    Args:
        profile_url: Profile URL forwarded to ``extract_posts``.
        hashtags: Comma-separated hashtag string; blank entries are ignored.
        num_posts: Maximum number of posts, forwarded to ``extract_posts``.

    Returns:
        The path of the saved document, ``"extracted_posts.docx"``, relative
        to the current working directory. The file is overwritten on every
        call.
    """
    # Drop blank entries so an empty textbox or a trailing comma does not
    # turn into an empty-string hashtag.
    hashtags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for i, post in enumerate(posts, start=1):
        caption = post["caption"]

        doc.add_heading(f"Post {i}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {caption}")

        # Extract text from image using Tesseract. A broken image link or an
        # OCR failure is recorded in the report instead of aborting it, so
        # the remaining posts are still analysed.
        if post["image_url"]:
            try:
                extracted_text = extract_text_from_image(post["image_url"])
            except (requests.RequestException, OSError, pytesseract.TesseractError) as exc:
                doc.add_paragraph(f"Extracted Text from Image: [unavailable: {exc}]")
            else:
                doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the emotion classifier
        emotions = analyze_sentiment(caption)
        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword Extraction
        keywords = keyword_extractor.extract_keywords(
            caption, keyphrase_ngram_range=(1, 2), stop_words="english"
        )
        doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

        # Frame Extraction (Context/Category)
        category = categorize_post(caption)
        doc.add_paragraph(f"Category/Frame: {category}")

    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path
96
+
97
+ # Gradio Interface
98
def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: build the report and return its file path.

    Args:
        profile_url: Value of the profile URL textbox.
        hashtags: Value of the comma-separated hashtags textbox.
        num_posts: Value of the "number of posts" field.

    Returns:
        Path of the generated Word document.

    Raises:
        gr.Error: If report generation fails; the message is shown in the UI.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        # This callback feeds a gr.File output, so returning an "Error: ..."
        # string would be interpreted as a file path. Raising gr.Error makes
        # Gradio display the message to the user instead.
        raise gr.Error(f"Error: {str(e)}") from e
104
+
105
+ # Gradio UI
106
# Input widgets, in the order gradio_app() receives their values.
profile_url_input = gr.Textbox(label="Social Media Profile URL")
hashtags_input = gr.Textbox(label="Hashtags (comma-separated)")
num_posts_input = gr.Number(label="Number of Posts to Extract", precision=0)

# The generated Word document is offered as a downloadable file.
report_output = gr.File(label="Download Extracted Posts")

iface = gr.Interface(
    fn=gradio_app,
    inputs=[profile_url_input, hashtags_input, num_posts_input],
    outputs=report_output,
    title="Social Media Post Extractor",
    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed sentiment analysis (RoBERTa), keyword extraction, and frame (context/category) extraction.",
)

# Start the web UI.
iface.launch()