Spaces:

ahm14
/

Post_Extractor

Runtime error

App Files Files Community

Post_Extractor / app.py

ahm14

Update app.py

f83098d verified 7 months ago

raw

history blame

4.66 kB

	import gradio as gr
	import requests
	import pytesseract
	from PIL import Image
	import os
	import docx
	from transformers import pipeline
	from keybert import KeyBERT
	from io import BytesIO

	# Set up Tesseract (ensure Tesseract is installed on your system)
	# For Windows, specify the Tesseract path if needed:
	# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

	# Set up AI models
	# Emotion model: scores a text against a set of emotion labels.
	emotion_classifier = pipeline(
	    task="text-classification",
	    model="joeddav/distilbert-base-uncased-go-emotions-student",
	)
	# Keyword model: KeyBERT constructed with its default settings.
	keyword_extractor = KeyBERT()
	# Zero-shot model: ranks caller-supplied candidate labels for a text.
	zero_shot_classifier = pipeline(
	    task="zero-shot-classification",
	    model="facebook/bart-large-mnli",
	)

	# Function to extract text from image using Tesseract
	def extract_text_from_image(image_url, timeout=10):
	    """Download an image and return the text Tesseract recognizes in it.

	    Args:
	        image_url: URL of the image to download.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout, ``requests.get`` can block indefinitely.

	    Returns:
	        The OCR output with leading and trailing whitespace removed.

	    Raises:
	        requests.RequestException: If the download fails, times out, or the
	            server responds with an HTTP error status.
	    """
	    response = requests.get(image_url, timeout=timeout)
	    # Fail on 4xx/5xx here instead of handing an error page to PIL.
	    response.raise_for_status()
	    # The context manager releases the image once OCR has finished.
	    with Image.open(BytesIO(response.content)) as image:
	        text = pytesseract.image_to_string(image)
	    return text.strip()

	# Function to extract posts from social media (placeholder for actual scraping logic)
	def extract_posts(profile_url, hashtags, num_posts):
	    """Return up to ``num_posts`` sample posts (placeholder for real scraping).

	    Args:
	        profile_url: Profile to read posts from. Currently unused.
	        hashtags: Hashtags to filter by. Currently unused.
	        num_posts: Maximum number of posts to return. Floats are truncated
	            to an integer, negative values are treated as 0, and ``None``
	            returns every available post.

	    Returns:
	        A list of post dicts with the keys ``caption``, ``image_url``,
	        ``date``, ``likes`` and ``comments``.
	    """
	    # Placeholder for actual scraping logic
	    posts = [
	        {
	            "caption": "This is a sample post about climate change and environmental activism.",
	            "image_url": "https://example.com/sample_image.jpg",
	            "date": "2023-10-01",
	            "likes": 100,
	            "comments": 20,
	        },
	        {
	            "caption": "Another post about technology and innovation in 2023.",
	            "image_url": "",
	            "date": "2023-10-02",
	            "likes": 50,
	            "comments": 10,
	        },
	    ]
	    if num_posts is None:
	        return posts
	    # A negative slice bound would count from the end of the list, and a
	    # float bound raises TypeError, so normalize to a non-negative int.
	    limit = max(int(num_posts), 0)
	    return posts[:limit]

	# Function to categorize post using Zero-Shot Classification
	def categorize_post(caption):
	    """Return the best-matching category for ``caption``.

	    The caption is scored against a fixed list of candidate labels with the
	    module-level ``zero_shot_classifier``; the first entry of the returned
	    ``"labels"`` list is taken as the most likely category.
	    """
	    candidate_labels = [
	        "activism",
	        "politics",
	        "social issues",
	        "technology",
	        "environment",
	        "health",
	    ]
	    prediction = zero_shot_classifier(caption, candidate_labels=candidate_labels)
	    ranked_labels = prediction["labels"]
	    return ranked_labels[0]

	# Function to analyze detailed sentiment using the DistilBERT-based GoEmotions emotion classifier
	def analyze_sentiment(caption):
	    """Return the three highest-scoring emotions for ``caption``.

	    The module-level ``emotion_classifier`` is called with ``top_k=None``
	    and its results are ordered by ``"score"``, highest first.
	    """

	    def _score(entry):
	        return entry["score"]

	    ranked = list(emotion_classifier(caption, top_k=None))
	    ranked.sort(key=_score, reverse=True)
	    return ranked[:3]

	# Function to process posts and generate Word document
	def process_posts(profile_url, hashtags, num_posts):
	    """Analyze the extracted posts and save the results as a Word document.

	    For every post returned by ``extract_posts`` the document receives the
	    post metadata, the OCR text of the post image (when the post has an
	    image URL), the top emotions, the extracted keywords and the zero-shot
	    category.

	    Args:
	        profile_url: Profile URL, passed through to ``extract_posts``.
	        hashtags: Comma-separated hashtags. Blank entries are ignored, and
	            ``None`` or an empty string yields an empty list.
	        num_posts: Maximum number of posts, passed through to
	            ``extract_posts``.

	    Returns:
	        The path of the saved document, ``"extracted_posts.docx"``.
	    """
	    # Drop blank entries so "" or "a,,b" does not produce empty hashtags.
	    hashtags = [tag.strip() for tag in (hashtags or "").split(",") if tag.strip()]
	    posts = extract_posts(profile_url, hashtags, num_posts)

	    doc = docx.Document()
	    doc.add_heading("Extracted Social Media Posts", 0)

	    for number, post in enumerate(posts, start=1):
	        caption = post["caption"]

	        doc.add_heading(f"Post {number}", level=1)
	        doc.add_paragraph(f"Date: {post['date']}")
	        doc.add_paragraph(f"Likes: {post['likes']}")
	        doc.add_paragraph(f"Comments: {post['comments']}")
	        doc.add_paragraph(f"Caption: {caption}")

	        # Extract text from image using Tesseract
	        if post["image_url"]:
	            try:
	                extracted_text = extract_text_from_image(post["image_url"])
	            except (OSError, RuntimeError) as exc:
	                # One unreachable or unreadable image should not abort the
	                # whole report. NOTE(review): this assumes download and
	                # image-decoding errors derive from OSError and OCR errors
	                # from RuntimeError - confirm against the installed
	                # requests / Pillow / pytesseract versions.
	                extracted_text = f"(unavailable: {exc})"
	            doc.add_paragraph(f"Extracted Text from Image: {extracted_text}")

	        # Detailed sentiment analysis with the module-level emotion classifier
	        emotions = analyze_sentiment(caption)
	        emotion_text = ", ".join(f"{e['label']} ({e['score']:.2f})" for e in emotions)
	        doc.add_paragraph(f"Top Emotions: {emotion_text}")

	        # Keyword Extraction
	        keywords = keyword_extractor.extract_keywords(
	            caption, keyphrase_ngram_range=(1, 2), stop_words="english"
	        )
	        doc.add_paragraph(f"Extracted Keywords: {', '.join(kw[0] for kw in keywords)}")

	        # Frame Extraction (Context/Category)
	        category = categorize_post(caption)
	        doc.add_paragraph(f"Category/Frame: {category}")

	    # NOTE(review): the output path is fixed, so concurrent requests would
	    # write to the same file - confirm whether that matters for this app.
	    doc_path = "extracted_posts.docx"
	    doc.save(doc_path)
	    return doc_path

	# Gradio Interface
	def gradio_app(profile_url, hashtags, num_posts):
	    """Gradio callback: build the report and return its path for download.

	    Args:
	        profile_url: Value of the profile URL textbox.
	        hashtags: Value of the comma-separated hashtags textbox.
	        num_posts: Value of the number-of-posts input.

	    Returns:
	        The path of the generated Word document.

	    Raises:
	        gr.Error: If processing fails, carrying the original error message
	            so Gradio can display it to the user.
	    """
	    try:
	        return process_posts(profile_url, hashtags, num_posts)
	    except Exception as e:
	        # The output component is a gr.File, so a returned "Error: ..."
	        # string would be interpreted as a file path. Raising gr.Error
	        # surfaces the message in the UI instead.
	        raise gr.Error(f"Error: {str(e)}") from e

	# Gradio UI
	# Input widgets, in the order of gradio_app's parameters.
	_input_components = [
	    gr.Textbox(label="Social Media Profile URL"),
	    gr.Textbox(label="Hashtags (comma-separated)"),
	    gr.Number(label="Number of Posts to Extract", precision=0),
	]
	# The callback returns a file path, offered to the user as a download.
	_output_component = gr.File(label="Download Extracted Posts")

	iface = gr.Interface(
	    fn=gradio_app,
	    inputs=_input_components,
	    outputs=_output_component,
	    title="Social Media Post Extractor",
	    description="Extract posts from a social media profile based on hashtags. Includes OCR (Tesseract), detailed sentiment analysis (RoBERTa), keyword extraction, and frame (context/category) extraction.",
	)

	# Start the web UI as soon as the module is executed.
	iface.launch()