import streamlit as st
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
import torch
import numpy as np
from collections import Counter


# Cache model loading so Streamlit does not reload the weights on every rerun
@st.cache_resource
def load_models():
    # AraBERT tokenizer and base model for embeddings
    tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
    model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
    # NOTE: this base checkpoint ships without a fine-tuned classification head,
    # so the head is randomly initialized and the labels it emits (LABEL_0,
    # LABEL_1) are not meaningful emotions. Substitute a checkpoint fine-tuned
    # for Arabic emotion classification for real results.
    emotion_model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
    classifier = pipeline("text-classification", model=emotion_model, tokenizer=tokenizer)
    return tokenizer, model, classifier


bert_tokenizer, bert_model, emotion_classifier = load_models()


# Generate mean-pooled AraBERT embeddings for a list of texts
def generate_embeddings(texts):
    all_embeddings = []
    for text in texts:
        # Tokenize with truncation to handle long sequences
        inputs = bert_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        # Generate embeddings without tracking gradients
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # Use the mean of the last hidden state as the poem embedding
        embedding = outputs.last_hidden_state.mean(dim=1).numpy()
        all_embeddings.append(embedding[0])  # Remove the batch dimension
    return np.array(all_embeddings)


# Classify the emotion of each text, truncating to the model's 512-token limit.
# The opening of a poem usually carries the most relevant signal, so keeping
# only the first 512 tokens is an acceptable approximation for long texts.
def classify_emotions(texts):
    emotions = []
    for text in texts:
        result = emotion_classifier(text, truncation=True, max_length=512)
        emotions.append(result[0]['label'])
    return emotions


# Process the uploaded file and summarize topics and emotions by country
def process_and_summarize(uploaded_file, top_n=50):
    # Determine the file type
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    elif uploaded_file.name.endswith(".xlsx"):
        df = pd.read_excel(uploaded_file)
    else:
        st.error("Unsupported file format.")
        return None, None

    # Validate required columns
    required_columns = ['country', 'poem']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        st.error(f"Missing columns: {', '.join(missing_columns)}")
        return None, None

    # Clean and preprocess
    df['country'] = df['country'].str.strip()
    df = df.dropna(subset=['country', 'poem'])

    # A single BERTopic model is re-fit for each country, so after the loop it
    # holds the topics of the last country processed
    topic_model = BERTopic(language="arabic")

    summaries = []
    for country, group in df.groupby('country'):
        st.info(f"Processing poems for {country}...")
        texts = group['poem'].tolist()

        # Classify emotions
        st.info(f"Classifying emotions for {country}...")
        emotions = classify_emotions(texts)

        # Generate embeddings and fit the topic model
        st.info(f"Generating embeddings and topics for {country}...")
        embeddings = generate_embeddings(texts)
        try:
            topics, _ = topic_model.fit_transform(texts, embeddings)
            # Aggregate topics and emotions
            top_topics = Counter(topics).most_common(top_n)
            top_emotions = Counter(emotions).most_common(top_n)
            summaries.append({
                'country': country,
                'total_poems': len(texts),
                'top_topics': top_topics,
                'top_emotions': top_emotions
            })
        except Exception as e:
            st.warning(f"Could not generate topics for {country}: {e}")
            continue

    return summaries, topic_model
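# For reference, each entry in `summaries` has the shape sketched below (the
# values are purely illustrative, not real output):
#
#   {
#       'country': 'Egypt',
#       'total_poems': 120,
#       'top_topics': [(0, 34), (5, 21), (-1, 12)],         # (topic_id, count); -1 is BERTopic's outlier topic
#       'top_emotions': [('LABEL_0', 70), ('LABEL_1', 50)]  # (label, count) from the classifier
#   }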
# Streamlit App Interface
st.title("Arabic Poem Topic Modeling & Emotion Classification")
st.write("Upload a CSV or Excel file containing Arabic poems with columns `country` and `poem`.")

uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    try:
        top_n = st.number_input(
            "Select the number of top topics/emotions to display:",
            min_value=1, max_value=100, value=50
        )
        summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
        if summaries is not None:
            st.success("Data successfully processed!")

            # Display the summary for each country
            for summary in summaries:
                st.write(f"### {summary['country']}")
                st.write(f"Total Poems: {summary['total_poems']}")
                st.write(f"Top {top_n} Topics:")
                st.write(summary['top_topics'])
                st.write(f"Top {top_n} Emotions:")
                st.write(summary['top_emotions'])

            # Topic info from the most recently fitted (i.e., last) country
            st.write("### Topic Information (last country fitted):")
            st.write(topic_model.get_topic_info())
    except Exception as e:
        st.error(f"Error: {e}")
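# Example usage (a minimal sketch; the file name and sample rows below are
# hypothetical): save this script as `app.py`, launch it with
#
#   streamlit run app.py
#
# and upload a file shaped like:
#
#   country,poem
#   Egypt,"<Arabic poem text>"
#   Iraq,"<Arabic poem text>"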