File size: 4,656 Bytes
103d063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f83098d
103d063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
import requests
import pytesseract
from PIL import Image
import os
import docx
from transformers import pipeline
from keybert import KeyBERT
from io import BytesIO

# Set up Tesseract (ensure Tesseract is installed on your system)
# For Windows, specify the Tesseract path if needed:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Hugging Face model identifiers used by the pipelines below
EMOTION_MODEL_ID = "joeddav/distilbert-base-uncased-go-emotions-student"
ZERO_SHOT_MODEL_ID = "facebook/bart-large-mnli"

# Build the AI models once at import time so every request reuses them
emotion_classifier = pipeline("text-classification", model=EMOTION_MODEL_ID)
keyword_extractor = KeyBERT()
zero_shot_classifier = pipeline("zero-shot-classification", model=ZERO_SHOT_MODEL_ID)

# Function to extract text from image using Tesseract
def extract_text_from_image(image_url):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: HTTP(S) URL of the image to download.

    Returns:
        The OCR output with leading and trailing whitespace removed.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        OSError: If the downloaded payload cannot be opened as an image.
    """
    # requests has no default timeout; without one a stalled server would
    # block this call (and the UI request behind it) indefinitely.
    response = requests.get(image_url, timeout=30)
    # Surface HTTP errors here rather than handing an error page to Pillow.
    response.raise_for_status()
    # The context manager releases the image's resources once OCR is done.
    with Image.open(BytesIO(response.content)) as image:
        text = pytesseract.image_to_string(image)
    return text.strip()

# Function to extract posts from social media (placeholder for actual scraping logic)
def extract_posts(profile_url, hashtags, num_posts):
    """Return up to ``num_posts`` sample posts.

    This is a placeholder for real scraping logic: ``profile_url`` and
    ``hashtags`` are accepted but not used yet, and the posts returned are
    hard-coded samples.

    Args:
        profile_url: Profile to scrape (currently unused).
        hashtags: Hashtags to filter by (currently unused).
        num_posts: Maximum number of posts to return. ``None`` returns every
            available post; integral floats are accepted; negative values
            are treated as 0.

    Returns:
        A list of dicts with the keys ``caption``, ``image_url``, ``date``,
        ``likes`` and ``comments``.
    """
    # Placeholder for actual scraping logic
    posts = [
        {
            "caption": "This is a sample post about climate change and environmental activism.",
            "image_url": "https://example.com/sample_image.jpg",
            "date": "2023-10-01",
            "likes": 100,
            "comments": 20,
        },
        {
            "caption": "Another post about technology and innovation in 2023.",
            "image_url": "",
            "date": "2023-10-02",
            "likes": 50,
            "comments": 10,
        },
    ]
    if num_posts is None:
        return posts
    # A negative slice bound would drop posts from the end instead of
    # limiting the count, and a float bound raises TypeError, so normalise.
    limit = max(int(num_posts), 0)
    return posts[:limit]

# Function to categorize post using Zero-Shot Classification
def categorize_post(caption):
    """Pick the best-fitting category for ``caption``.

    The caption is scored against a fixed list of candidate labels by
    ``zero_shot_classifier``; the first entry of the result's ``labels``
    list is returned.
    """
    candidate_labels = [
        "activism",
        "politics",
        "social issues",
        "technology",
        "environment",
        "health",
    ]
    prediction = zero_shot_classifier(caption, candidate_labels=candidate_labels)
    ranked_labels = prediction["labels"]
    return ranked_labels[0]

# Function to analyze detailed sentiment using a DistilBERT-based emotion classifier
def analyze_sentiment(caption):
    """Return the three highest-scoring emotion predictions for ``caption``.

    ``emotion_classifier`` is called with ``top_k=None`` and its
    predictions are ordered by their ``score`` value, highest first.
    """
    def by_score(prediction):
        return prediction["score"]

    all_predictions = emotion_classifier(caption, top_k=None)
    ranked = sorted(all_predictions, key=by_score, reverse=True)
    return ranked[:3]

# Function to process posts and generate Word document
def process_posts(profile_url, hashtags, num_posts):
    """Analyse posts and write the results to a Word document.

    For every post returned by ``extract_posts`` the report lists its
    metadata and caption, any text recognised in the post's image, the top
    emotions, extracted keywords and the predicted category.

    Args:
        profile_url: Profile URL forwarded to ``extract_posts``.
        hashtags: Comma-separated hashtags; surrounding whitespace is
            stripped and blank entries are dropped.
        num_posts: Maximum number of posts, forwarded to ``extract_posts``.

    Returns:
        The path of the saved document (``"extracted_posts.docx"``).
    """
    # ``or ""`` tolerates a missing value; the filter keeps inputs such as
    # "" or "a,,b" from producing empty hashtags.
    hashtags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
    posts = extract_posts(profile_url, hashtags, num_posts)

    doc = docx.Document()
    doc.add_heading("Extracted Social Media Posts", 0)

    for number, post in enumerate(posts, start=1):
        caption = post["caption"]

        doc.add_heading(f"Post {number}", level=1)
        doc.add_paragraph(f"Date: {post['date']}")
        doc.add_paragraph(f"Likes: {post['likes']}")
        doc.add_paragraph(f"Comments: {post['comments']}")
        doc.add_paragraph(f"Caption: {caption}")

        # Extract text from image using Tesseract
        image_url = post.get("image_url")
        if image_url:
            try:
                extracted_text = extract_text_from_image(image_url)
            except (requests.RequestException, OSError, RuntimeError) as exc:
                # One unreachable or unreadable image should not discard the
                # whole report; record the failure for this post and carry on.
                doc.add_paragraph(f"Extracted Text from Image: [unavailable: {exc}]")
            else:
                doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

        # Detailed sentiment analysis using the emotion classifier
        emotions = analyze_sentiment(caption)
        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
        doc.add_paragraph(f"Top Emotions: {emotion_text}")

        # Keyword Extraction
        keywords = keyword_extractor.extract_keywords(
            caption, keyphrase_ngram_range=(1, 2), stop_words="english"
        )
        doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

        # Frame Extraction (Context/Category)
        category = categorize_post(caption)
        doc.add_paragraph(f"Category/Frame: {category}")

    # NOTE(review): the file name is fixed and relative to the working
    # directory, so each call overwrites the previous report.
    doc_path = "extracted_posts.docx"
    doc.save(doc_path)
    return doc_path

# Gradio Interface
def gradio_app(profile_url, hashtags, num_posts):
    """Gradio callback: build the report and return its file path.

    Args:
        profile_url: Value of the profile URL textbox.
        hashtags: Value of the comma-separated hashtags textbox.
        num_posts: Value of the "number of posts" input.

    Returns:
        Path of the generated Word document.

    Raises:
        gr.Error: If report generation fails for any reason.
    """
    try:
        return process_posts(profile_url, hashtags, num_posts)
    except Exception as e:
        # The interface output is a file component, so returning an error
        # string would be treated as a file path. Raising gr.Error shows the
        # message to the user in the UI instead.
        raise gr.Error(f"Error: {e}") from e

# Gradio UI
# Three inputs feed gradio_app; its return value is offered as a file download.
iface = gr.Interface(
    fn=gradio_app,
    inputs=[
        gr.Textbox(label="Social Media Profile URL"),
        gr.Textbox(label="Hashtags (comma-separated)"),
        gr.Number(label="Number of Posts to Extract", precision=0),
    ],
    outputs=gr.File(label="Download Extracted Posts"),
    title="Social Media Post Extractor",
    # The emotion model loaded by this app is a DistilBERT checkpoint, so the
    # description names that model family.
    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed sentiment analysis (DistilBERT), keyword extraction, and frame (context/category) extraction.",
)

# Start the web server only when this file is run as a script, so importing
# the module (e.g. from tests or tooling) does not launch the app.
if __name__ == "__main__":
    iface.launch()