AlainDeLong committed on
Commit fa7aa9f · 1 Parent(s): cc4fd89

create app

.gitignore ADDED
@@ -0,0 +1,183 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Streamlit Secrets
+ .streamlit/
+
+ # Youtube Links
+ src/link.txt
+
+ # Test files
+ src/test_plotly_script.py
requirements.txt CHANGED
@@ -1,3 +1,8 @@
- altair
- pandas
- streamlit
+ altair==5.5.0
+ pandas==2.2.2
+ streamlit==1.45.0
+ torch==2.5.1
+ transformers==4.46.2
+ regex==2024.11.6
+ plotly==6.0.1
+ google-api-python-client==2.169.0
src/fine_tuned_model/config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "negative",
+     "1": "neutral",
+     "2": "positive"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "negative": 0,
+     "neutral": 1,
+     "positive": 2
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.3",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
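
A quick sanity check of this config, sketched under the assumption that the repo is checked out with this layout and the pinned transformers package is installed:

    # Minimal sketch: confirm the 3-class head that predict.py relies on.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("src/fine_tuned_model")
    print(config.id2label)    # expected: {0: 'negative', 1: 'neutral', 2: 'positive'}
    print(config.num_labels)  # expected: 3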
src/fine_tuned_model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
src/fine_tuned_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff26327999e09e76218bd59e2f78b1445a2720ea58fb27c15f47ae3f1e6cd42e
+ size 498615900
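
The three lines above are a Git LFS pointer, not the model weights themselves; the actual ~499 MB safetensors file is fetched with "git lfs pull" after cloning. A minimal sketch for catching the still-a-pointer case before loading (the path assumes this repo layout):

    import os

    path = "src/fine_tuned_model/model.safetensors"
    # An LFS pointer file is only ~130 bytes; the real weights are ~499 MB per the pointer's size field.
    if os.path.getsize(path) < 1024:
        raise RuntimeError("model.safetensors is still an LFS pointer; run 'git lfs pull' first.")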
src/fine_tuned_model/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "unk_token": "<unk>"
+ }
src/fine_tuned_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
src/fine_tuned_model/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
src/fine_tuned_model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
src/predict.py ADDED
@@ -0,0 +1,473 @@
+ # src/predict.py
+
+ import os  # To help build file paths correctly
+ import torch  # PyTorch library, for tensors and model operations
+ from transformers import (
+     AutoModelForSequenceClassification,
+     AutoTokenizer,
+ )  # Hugging Face classes for loading the model and tokenizer
+
+
+ # --- Configuration ---
+ # This is where our fine-tuned model and tokenizer files are stored
+ # Assuming 'fine_tuned_model' directory is inside 'src/' and next to this predict.py file
+ _SCRIPT_DIR = os.path.dirname(
+     os.path.abspath(__file__)
+ )  # Gets the directory where this script is
+ MODEL_PATH = os.path.join(
+     _SCRIPT_DIR, "fine_tuned_model"
+ )  # User confirmed this variable name and directory
+
+ print(f"DEBUG (predict.py): Model path set to: {MODEL_PATH}")  # For checking the path
+
+ # --- Device Setup ---
+ # Check if a GPU is available, otherwise use CPU
+ # Using a GPU makes predictions much faster!
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     # Trying to get the name of the GPU, just for information
+     try:
+         gpu_name = torch.cuda.get_device_name(0)
+         print(f"INFO (predict.py): GPU is available ({gpu_name}), using CUDA.")
+     except Exception as e:
+         print(
+             f"INFO (predict.py): GPU is available, using CUDA. (Could not get GPU name: {e})"
+         )
+ else:
+     device = torch.device("cpu")
+     print(
+         "INFO (predict.py): GPU not available, using CPU. Predictions might be slower."
+     )
+
+ # --- Load Model and Tokenizer ---
+ # We load these once when the script (or module) is first loaded.
+ # This is much better than loading them every time we want to predict.
+ model = None
+ tokenizer = None
+ id2label_mapping = {0: "negative", 1: "neutral", 2: "positive"}  # Default mapping
+
+ try:
+     print(f"INFO (predict.py): Loading model from {MODEL_PATH}...")
+     # Load the pre-trained model for sequence classification
+     # This should be the PyTorch RoBERTa model we fine-tuned
+     model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
+     model.to(device)  # Move the model to the GPU (or CPU if no GPU)
+     model.eval()  # Set the model to evaluation mode (important for layers like Dropout)
+     print("INFO (predict.py): Model loaded successfully and set to evaluation mode.")
+
+     print(f"INFO (predict.py): Loading tokenizer from {MODEL_PATH}...")
+     # Load the tokenizer that matches the model
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+     print("INFO (predict.py): Tokenizer loaded successfully.")
+
+     # Get the label mapping from the model's configuration
+     # This was saved during fine-tuning
+     if hasattr(model.config, "id2label") and model.config.id2label:
+         id2label_mapping = model.config.id2label
+         # Convert string keys from config.json to int if necessary
+         id2label_mapping = {int(k): v for k, v in id2label_mapping.items()}
+         print(
+             f"INFO (predict.py): Loaded id2label mapping from model config: {id2label_mapping}"
+         )
+     else:
+         print(
+             "WARN (predict.py): id2label not found in model config, using default mapping."
+         )
+
+ except FileNotFoundError:
+     print("--- CRITICAL ERROR (predict.py) ---")
+     print(f"Model or Tokenizer files NOT FOUND at the specified path: {MODEL_PATH}")
+     print(
+         f"Please ensure the '{os.path.basename(MODEL_PATH)}' directory exists at '{_SCRIPT_DIR}' and contains all necessary model files (pytorch_model.bin/model.safetensors, config.json, tokenizer files, etc.)."
+     )
+     # Keep model and tokenizer as None, so predict_sentiments can handle it
+ except Exception as e:
+     print("--- ERROR (predict.py) ---")
+     print(f"An unexpected error occurred loading model or tokenizer: {e}")
+     # Keep model and tokenizer as None
+
+
+ # --- Preprocessing Function ---
+ # Same function we used for training data to make sure inputs are consistent
+ def preprocess_tweet(text):
+     """Replaces @user mentions and http links with placeholders."""
+     preprocessed_text = []
+     if text is None:
+         return ""  # Handle None input
+     # Split text into parts by space
+     for t in text.split(" "):
+         if len(t) > 0:  # Avoid processing empty parts from multiple spaces
+             t = "@user" if t.startswith("@") else t  # Replace mentions
+             t = "http" if t.startswith("http") else t  # Replace links
+             preprocessed_text.append(t)
+     return " ".join(preprocessed_text)  # Put the parts back together
+
+
+ # --- Prediction Function (UPDATED to return probabilities) ---
+ def predict_sentiments(comment_list: list):
+     """
+     Predicts sentiments for a list of comment strings.
+     Returns a list of dictionaries, each containing the predicted label
+     and the probabilities (scores) for each class.
+     e.g., [{'label': 'positive', 'scores': {'negative': 0.1, 'neutral': 0.2, 'positive': 0.7}}, ...]
+     """
+     # Check if model and tokenizer are ready
+     if model is None or tokenizer is None:
+         print(
+             "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot predict."
+         )
+         # Return an error structure
+         return [{"label": "Error: Model not loaded", "scores": {}}] * len(comment_list)
+
+     if not comment_list:  # Handle empty input list
+         return []
+
+     """
+     # Preprocess comments first
+     processed_comments = [preprocess_tweet(comment) for comment in comment_list]
+
+     # Tokenize the batch
+     print(
+         f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments for prediction..."
+     )
+     inputs = tokenizer(
+         processed_comments,
+         padding=True,
+         truncation=True,
+         return_tensors="pt",  # PyTorch tensors
+         max_length=(
+             tokenizer.model_max_length
+             if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
+             else 512
+         ),
+     )
+
+     # Move inputs to the correct device
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+     results_list = []  # To store the dictionaries
+     try:
+         # Perform inference without calculating gradients
+         with torch.no_grad():
+             outputs = model(**inputs)
+             logits = outputs.logits
+
+         # Apply Softmax to convert logits to probabilities
+         # dim=-1 applies softmax across the last dimension (the classes)
+         probabilities = torch.softmax(logits, dim=-1)
+
+         # Get the predicted class IDs (index of the highest probability)
+         predicted_class_ids = torch.argmax(probabilities, dim=-1)
+
+         # Move results to CPU and convert to Python lists/numpy for easier handling
+         probs_list = (
+             probabilities.cpu().numpy().tolist()
+         )  # List of lists of probabilities
+         ids_list = predicted_class_ids.cpu().numpy().tolist()  # List of predicted IDs
+
+         print(
+             f"DEBUG (predict.py): Probabilities and IDs calculated. Batch size: {len(ids_list)}"
+         )
+
+         # Format the output: list of dictionaries
+         for i in range(len(ids_list)):
+             pred_id = ids_list[i]
+             # Map predicted ID to label string using the mapping from model config
+             pred_label = id2label_mapping.get(pred_id, "Unknown")
+
+             # Create the dictionary of scores {label_name: probability}
+             pred_scores = {
+                 label_name: probs_list[i][label_id]
+                 for label_id, label_name in id2label_mapping.items()
+                 # Ensure index is within bounds, just in case
+                 if 0 <= label_id < probabilities.shape[-1]
+             }
+
+             # Append the result for this comment
+             results_list.append({"label": pred_label, "scores": pred_scores})
+
+     except Exception as e:
+         print(f"--- ERROR (predict.py - predict_sentiments) ---")
+         print(f"Error during sentiment prediction inference or formatting: {e}")
+         import traceback
+
+         traceback.print_exc()  # Print full traceback for debugging
+         # Return error structure for each comment
+         results_list = [
+             {"label": "Error: Prediction failed", "scores": {}} for _ in comment_list
+         ]
+
+     return results_list  # Return the list of dictionaries
+     """
+
+     inference_batch_size = 64  # You can adjust this number based on performance/memory
+     print(
+         f"INFO (predict.py): Predicting sentiments for {len(comment_list)} comments in batches of {inference_batch_size}..."
+     )
+
+     all_results_list = []  # We'll collect results for all batches here
+
+     # --- Loop through the comment list in batches ---
+     try:
+         total_comments = len(comment_list)
+         # This loop goes from 0 to total_comments, jumping by inference_batch_size each time
+         for i in range(0, total_comments, inference_batch_size):
+             # Get the current slice of comments for this batch
+             batch_comments = comment_list[i : i + inference_batch_size]
+
+             # Just printing progress for long lists
+             current_batch_num = i // inference_batch_size + 1
+             total_batches = (
+                 total_comments + inference_batch_size - 1
+             ) // inference_batch_size
+             print(
+                 f"DEBUG (predict.py): Processing batch {current_batch_num}/{total_batches}..."
+             )
+
+             # --- Process ONLY the current batch ---
+             # 1. Preprocess this specific batch
+             processed_batch = [preprocess_tweet(comment) for comment in batch_comments]
+
+             # 2. Tokenize this batch
+             # Tokenizer handles padding within this smaller batch
+             inputs = tokenizer(
+                 processed_batch,
+                 padding=True,
+                 truncation=True,
+                 return_tensors="pt",
+                 max_length=(
+                     tokenizer.model_max_length
+                     if hasattr(tokenizer, "model_max_length")
+                     and tokenizer.model_max_length
+                     else 512
+                 ),
+             )
+
+             # 3. Move this batch's inputs to the device (GPU/CPU)
+             inputs = {k: v.to(device) for k, v in inputs.items()}
+
+             # 4. Make prediction for this batch - no need for gradients
+             with torch.no_grad():
+                 outputs = model(**inputs)
+                 logits = outputs.logits  # Raw scores from the model for this batch
+
+             # 5. Calculate probabilities and get predicted class IDs for this batch
+             probabilities_batch = torch.softmax(logits, dim=-1)
+             predicted_class_ids_batch = torch.argmax(probabilities_batch, dim=-1)
+
+             # 6. Move results back to CPU, convert to lists for easier looping
+             probs_list_batch = probabilities_batch.cpu().numpy().tolist()
+             ids_list_batch = predicted_class_ids_batch.cpu().numpy().tolist()
+
+             # 7. Format results for each comment in THIS batch
+             batch_results = []
+             for j in range(len(ids_list_batch)):
+                 pred_id = ids_list_batch[j]
+                 pred_label = id2label_mapping.get(
+                     pred_id, "Unknown"
+                 )  # Map ID to label name
+                 # Create the scores dictionary for this comment
+                 pred_scores = {
+                     label_name: probs_list_batch[j][label_id]
+                     for label_id, label_name in id2label_mapping.items()
+                     if 0
+                     <= label_id
+                     < probabilities_batch.shape[-1]  # Safety check for index
+                 }
+                 # Add the result for this comment
+                 batch_results.append({"label": pred_label, "scores": pred_scores})
+
+             # Add the results from this completed batch to our main list
+             all_results_list.extend(batch_results)
+             # --- Finished processing current batch ---
+
+         print(
+             f"INFO (predict.py): Finished processing all {len(all_results_list)} comments."
+         )
+
+     except Exception as e:
+         # Catch errors that might happen during the loop
+         print("--- ERROR (predict.py - predict_sentiments loop) ---")
+         print(
+             f"An error occurred during batch prediction (around comment index {i}): {e}"
+         )
+         import traceback
+
+         traceback.print_exc()  # Print full error details to console
+         # Try to return results for processed batches + error messages for the rest
+         num_processed = len(all_results_list)
+         num_remaining = len(comment_list) - num_processed
+         # Add error indicators for comments that couldn't be processed
+         all_results_list.extend(
+             [{"label": "Error: Batch failed", "scores": {}}] * num_remaining
+         )
+
+     # Return the list containing results for all comments
+     return all_results_list
+
+
+ # --- Main block for testing this script directly (UPDATED to show scores) ---
+ if __name__ == "__main__":
+     print("\n--- Testing predict.py Script Directly ---")
+     if model and tokenizer:
+         sample_comments_for_testing = [
+             "This is an amazing movie, I loved it!",
+             "I'm not sure how I feel about this, it was okay.",
+             "Worst experience ever, would not recommend.",
+             "The food was alright, but the service was slow.",
+             "What a fantastic day! #blessed",
+             "I hate waiting in long lines.",
+             "@user Check out http this is cool.",
+             "Just a normal sentence, nothing special here.",
+             "",
+             "This new update is absolutely terrible and full of bugs.",
+         ]
+
+         print("\nInput Comments for Direct Test:")
+         for i, c in enumerate(sample_comments_for_testing):
+             print(f"{i+1}. '{c}'")
+
+         # Get predictions (now a list of dictionaries)
+         prediction_results = predict_sentiments(sample_comments_for_testing)
+
+         print("\nPredicted Sentiments and Scores (Direct Test):")
+         # Loop through the results list
+         for i, (comment, result) in enumerate(
+             zip(sample_comments_for_testing, prediction_results)
+         ):
+             print(f"{i+1}. Comment: '{comment}'")
+             # Format scores nicely for printing
+             scores_dict = result.get("scores", {})
+             formatted_scores = ", ".join(
+                 [f"{name}: {score:.3f}" for name, score in scores_dict.items()]
+             )
+             print(f"   -> Predicted Label: {result.get('label', 'N/A')}")
+             # Also print the raw scores dictionary
+             print(f"   -> Scores: {{{formatted_scores}}}")
+         print("--- Direct Test Finished ---")
+     else:
+         print("ERROR (predict.py - main test): Model and/or tokenizer not loaded.")
+         print(
+             f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
+         )
+
+
+ # # --- Prediction Function ---
+ # def predict_sentiments(comment_list: list):
+ #     """
+ #     Predicts sentiments for a list of comment strings.
+ #     Returns a list of sentiment labels (e.g., "positive", "neutral", "negative").
+ #     """
+ #     # Check if model and tokenizer were loaded properly
+ #     if model is None or tokenizer is None:
+ #         print(
+ #             "ERROR (predict.py - predict_sentiments): Model or Tokenizer not loaded. Cannot make predictions."
+ #         )
+ #         # Return an error message for each comment if model isn't ready
+ #         return ["Error: Model not loaded"] * len(comment_list)
+
+ #     if not comment_list:  # If the input list is empty
+ #         return []
+
+ #     # First, preprocess all comments like we did for training data
+ #     processed_comments = [preprocess_tweet(comment) for comment in comment_list]
+
+ #     # Tokenize the processed comments
+ #     # This turns text into numbers (input IDs, attention mask) for the model
+ #     # padding=True: make all sequences in the batch the same length
+ #     # truncation=True: cut off sequences longer than the model can handle
+ #     # return_tensors="pt": return PyTorch tensors
+ #     # max_length: ensure we don't exceed model's limit (e.g., 512 for RoBERTa)
+ #     print(f"DEBUG (predict.py): Tokenizing {len(processed_comments)} comments...")
+ #     inputs = tokenizer(
+ #         processed_comments,
+ #         padding=True,
+ #         truncation=True,
+ #         return_tensors="pt",
+ #         max_length=(
+ #             tokenizer.model_max_length
+ #             if hasattr(tokenizer, "model_max_length") and tokenizer.model_max_length
+ #             else 512
+ #         ),
+ #     )
+
+ #     # Move the tokenized inputs to the same device as the model (GPU or CPU)
+ #     inputs = {k: v.to(device) for k, v in inputs.items()}
+
+ #     sentiment_labels_as_strings = []
+ #     try:
+ #         # Make predictions
+ #         # torch.no_grad() is important for inference:
+ #         # it tells PyTorch not to calculate gradients, saving memory and speeding things up.
+ #         with torch.no_grad():
+ #             outputs = model(**inputs)  # Get model outputs
+ #             logits = outputs.logits  # These are the raw scores from the final layer
+
+ #         # Get the predicted class ID by finding the index with the highest score (logit)
+ #         # logits shape is (batch_size, num_labels)
+ #         predicted_class_ids = torch.argmax(
+ #             logits, dim=-1
+ #         )  # dim=-1 means find max along the last dimension
+
+ #         # Convert the predicted class IDs (numbers) to actual sentiment labels (strings)
+ #         # using the id2label_mapping we got from the model's config
+ #         # .item() gets the Python number from a 0-dim PyTorch tensor
+ #         sentiment_labels_as_strings = [
+ #             id2label_mapping.get(class_id.item(), "Unknown")
+ #             for class_id in predicted_class_ids
+ #         ]
+ #         print(
+ #             f"DEBUG (predict.py): Predictions made. Example: {sentiment_labels_as_strings[:3] if sentiment_labels_as_strings else 'N/A'}"
+ #         )
+
+ #     except Exception as e:
+ #         print(f"--- ERROR (predict.py - predict_sentiments) ---")
+ #         print(f"Error during sentiment prediction inference: {e}")
+ #         # Return an error message for each comment if prediction fails
+ #         sentiment_labels_as_strings = ["Error: Prediction failed"] * len(comment_list)
+
+ #     return sentiment_labels_as_strings
+
+
+ # # --- Main block for testing this script directly ---
+ # # This part only runs if you execute 'python src/predict.py' from the terminal
+ # # It won't run when app.py imports this file.
+ # if __name__ == "__main__":
+ #     print("\n--- Testing predict.py Script Directly ---")
+ #     # Check if model was loaded, otherwise can't test
+ #     if model and tokenizer:
+ #         sample_comments_for_testing = [
+ #             "This is an amazing movie, I loved it!",  # Expected: positive
+ #             "I'm not sure how I feel about this, it was okay.",  # Expected: neutral
+ #             "Worst experience ever, would not recommend.",  # Expected: negative
+ #             "The food was alright, but the service was slow.",  # Expected: neutral or negative
+ #             "What a fantastic day! #blessed",  # Expected: positive
+ #             "I hate waiting in long lines.",  # Expected: negative
+ #             "@user Check out http this is cool.",  # Test preprocessing, Expected: positive or neutral
+ #             "Just a normal sentence, nothing special here.",  # Expected: neutral
+ #             "",  # Empty string test
+ #             "This new update is absolutely terrible and full of bugs.",  # Expected: negative
+ #         ]
+
+ #         print("\nInput Comments for Direct Test:")
+ #         for i, c in enumerate(sample_comments_for_testing):
+ #             print(f"{i + 1}. '{c}'")
+
+ #         # Get predictions using our main function
+ #         predicted_sentiments = predict_sentiments(sample_comments_for_testing)
+
+ #         print("\nPredicted Sentiments (Direct Test):")
+ #         for i, (comment, sentiment) in enumerate(
+ #             zip(sample_comments_for_testing, predicted_sentiments)
+ #         ):
+ #             print(
+ #                 f"{i + 1}. Comment: '{comment}'\n   -> Predicted Sentiment: {sentiment}"
+ #             )
+ #         print("--- Direct Test Finished ---")
+ #     else:
+ #         print(
+ #             "ERROR (predict.py - main test): Model and/or tokenizer not loaded. Cannot run direct test."
+ #         )
+ #         print(
+ #             f"Please check the MODEL_PATH ('{MODEL_PATH}') and ensure model files are present."
+ #         )
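
For reference, a hypothetical usage sketch of the function this file exposes (run from inside src/ so the bare import resolves; the comments passed in are illustrative):

    from predict import predict_sentiments

    results = predict_sentiments(["I loved this video!", "Meh, it was fine."])
    for r in results:
        # Each entry is {"label": ..., "scores": {"negative": ..., "neutral": ..., "positive": ...}}
        print(r["label"], r["scores"])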
src/streamlit_app.py CHANGED
@@ -1,40 +1,709 @@
- import altair as alt
- import numpy as np
- import pandas as pd
- import streamlit as st
-
- """
- # Welcome to Streamlit!
-
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
- forums](https://discuss.streamlit.io).
-
- In the meantime, below is an example of what you can do with just a few lines of code:
- """
-
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
- indices = np.linspace(0, 1, num_points)
- theta = 2 * np.pi * num_turns * indices
- radius = indices
-
- x = radius * np.cos(theta)
- y = radius * np.sin(theta)
-
- df = pd.DataFrame({
-     "x": x,
-     "y": y,
-     "idx": indices,
-     "rand": np.random.randn(num_points),
- })
-
- st.altair_chart(alt.Chart(df, height=700, width=700)
-     .mark_point(filled=True)
-     .encode(
-         x=alt.X("x", axis=None),
-         y=alt.Y("y", axis=None),
-         color=alt.Color("idx", legend=None, scale=alt.Scale()),
-         size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-     ))
+ # src/streamlit_app.py
+
+ import streamlit as st
+ import pandas as pd
+ import re  # For robust YouTube video ID extraction
+
+ # Try to import Plotly; if it's not available, we'll use Streamlit's basic charts
+ try:
+     import plotly.express as px
+
+     PLOTLY_AVAILABLE = True
+ except ImportError:
+     PLOTLY_AVAILABLE = False
+     st.sidebar.warning(
+         "Plotly not installed. Charts will be basic. Consider 'pip install plotly'."
+     )  # Optional warning
+
+ # Import our custom modules from the src directory
+ try:
+     from predict import (
+         predict_sentiments,
+     )  # Returns a list of dicts: [{"label": ..., "scores": {...}}, ...]
+     from youtube import (
+         get_video_comments,
+     )  # Returns a dict with the video title and a list of comment strings
+ except ImportError as e:
+     st.error(
+         f"Failed to import necessary modules (predict.py, youtube.py). Ensure they are in the 'src' directory. Error: {e}"
+     )
+     # Stop the app if core modules are missing
+     st.stop()
+
+
+ def extract_video_id(url_or_id: str) -> str | None:
+     """
+     Tries to get the YouTube video ID from different common URL types.
+     Also handles if the input is just the ID itself.
+     A bit of regex to find the ID part in common URLs.
+     """
+     if not url_or_id:
+         return None
+
+     # Patterns for various YouTube URL formats
+     # Order matters: more specific patterns should come first if overlap exists
+     patterns = [
+         r"watch\?v=([a-zA-Z0-9_-]{11})",  # Standard watch URL
+         r"youtu\.be/([a-zA-Z0-9_-]{11})",  # Shortened URL
+         r"embed/([a-zA-Z0-9_-]{11})",  # Embed URL
+         r"shorts/([a-zA-Z0-9_-]{11})",  # Shorts URL
+     ]
+
+     for pattern in patterns:
+         match = re.search(pattern, url_or_id)
+         if match:
+             return match.group(1)  # The first capturing group is the ID
+
+     # If no pattern matches, check if the input itself is a valid 11-char ID
+     # Basic check: 11 chars, none of the URL punctuation characters
+     if len(url_or_id) == 11 and not (
+         "/" in url_or_id or "?" in url_or_id or "=" in url_or_id or "." in url_or_id
+     ):
+         return url_or_id  # Assume it's a direct ID
+
+     return None  # Return None if no ID found
+
+
+ def analyze_youtube_video(video_url_or_id: str):
+     """
+     Main function for the YouTube analysis part.
+     It gets comments, then predicts their sentiments,
+     then summarizes the results.
+     """
+     video_id = extract_video_id(video_url_or_id)
+     if not video_id:
+         # Give a more helpful error message to the user
+         st.error(
+             "Oops! That doesn't look like a valid YouTube URL or Video ID. Please check and try again. Example: Z9kGRMglw-I or youtu.be/Z9kGRMglw-I"
+         )
+         return None  # Stop if no valid ID
+
+     summary_data = {}  # Initialize
+     # comments_with_sentiments = []  # Initialize
+
+     try:
+         with st.spinner(f"Fetching comments & title for video ID: {video_id}..."):
+             video_data = get_video_comments(video_id)
+             comments_text_list = video_data.get("comments", [])
+             video_title = video_data.get("title", "Video Title Not Found")
+             print(
+                 f"DEBUG (streamlit_app.py): Received title from youtube.py: '{video_title}'"
+             )
+
+         # Check if we actually got any comments
+         if not comments_text_list:
+             st.warning(
+                 "Hmm, no comments found for this video. Are comments enabled? Or is it a very new video?"
+             )
+             # Provide a default empty summary structure
+             summary_data = {
+                 "num_comments_fetched": 0,
+                 "num_comments_analyzed": 0,
+                 "positive": 0,
+                 "neutral": 0,
+                 "negative": 0,
+                 "positive_percentage": 0,
+                 "neutral_percentage": 0,
+                 "negative_percentage": 0,
+                 "num_valid_predictions": 0,
+             }
+             return {"summary": summary_data, "comments_data": []}
+
+         st.info(
+             f"Great! Found {len(comments_text_list)} comments. Now thinking about their feelings (sentiments)..."
+         )
+         # Another spinner for the prediction part, as this can be slow on CPU
+         with st.spinner("Analyzing sentiments with the model... Please wait."):
+             # This calls predict_sentiments from predict.py
+             # Expected to return a list of dicts: [{"label": ..., "scores": {...}}, ...]
+             prediction_results = predict_sentiments(comments_text_list)
+
+         positive_count = 0
+         negative_count = 0
+         neutral_count = 0
+         error_count = 0
+
+         for result in prediction_results:
+             label = result.get("label")
+             if label == "positive":
+                 positive_count += 1
+             elif label == "negative":
+                 negative_count += 1
+             elif label == "neutral":
+                 neutral_count += 1
+             else:
+                 error_count += 1
+
+         num_valid_predictions = positive_count + negative_count + neutral_count
+         total_comments_processed = len(prediction_results)
+         if error_count > 0:
+             st.warning(
+                 f"Could not predict sentiment properly for {error_count} comments."
+             )
+
+         summary_data = {
+             "video_title": video_title,
+             "num_comments_fetched": len(comments_text_list),
+             "num_comments_analyzed": total_comments_processed,
+             "num_valid_predictions": num_valid_predictions,
+             "positive": positive_count,
+             "negative": negative_count,
+             "neutral": neutral_count,
+             "positive_percentage": (
+                 (positive_count / num_valid_predictions) * 100
+                 if num_valid_predictions > 0
+                 else 0
+             ),
+             "neutral_percentage": (
+                 (neutral_count / num_valid_predictions) * 100
+                 if num_valid_predictions > 0
+                 else 0
+             ),
+             "negative_percentage": (
+                 (negative_count / num_valid_predictions) * 100
+                 if num_valid_predictions > 0
+                 else 0
+             ),
+         }
+
+         comments_data_for_df = []
+         for i in range(len(comments_text_list)):
+             comment_text = comments_text_list[i]
+             result = prediction_results[i]
+             label = result.get("label", "Error")
+             scores = result.get("scores", {})
+             confidence = max(scores.values()) if scores else 0.0
+
+             comments_data_for_df.append(
+                 {
+                     "Comment Text": comment_text,
+                     "Predicted Sentiment": label,
+                     "Confidence": confidence,
+                     # "All Scores": scores
+                 }
+             )
+
+         return {"summary": summary_data, "comments_data": comments_data_for_df}
+
+     except Exception as e:
+         # Show a general error if anything unexpected happens
+         st.error(f"Uh oh! An error popped up during analysis: {str(e)}")
+         # Also print to console for more detailed debugging when running locally
+         print(f"Full error in analyze_youtube_video: {e}")
+         import traceback
+
+         traceback.print_exc()  # Print full traceback to console
+         return None  # Return None on error
+
+
+ # --- Streamlit App UI ---
+
+ # Page configuration: set to centered layout (default) instead of "wide"
+ st.set_page_config(page_title="Social Sentiment Analysis", layout="centered")
+
+ st.title("📊 SOCIAL SENTIMENT ANALYSIS")
+ # A little description for the user
+ st.write(
+     """
+     Welcome to the **Social Sentiment Analyzer!** 👋
+
+     This application uses a fine-tuned RoBERTa model to predict the sentiment (Positive, Neutral, or Negative) expressed in text.
+
+     Use the tabs below to choose your input method:
+     * **Analyze Text Input:** Paste or type any English text directly.
+     * **YouTube Analysis:** Enter a YouTube video URL or ID to analyze its comments.
+     * **Twitter/X Analysis:** Support for analyzing Twitter/X posts is coming soon!
+
+     Select a tab to begin!
+     """
+ )
+
+ # Tabs for different platforms; makes it easy to add Twitter later
+ tab_text_input, tab_youtube, tab_twitter = st.tabs(
+     ["Analyze Text Input", "YouTube Analysis", "Twitter/X Analysis (Coming Soon!)"]
+ )
+
+ with tab_text_input:
+     # Header for this tab
+     st.header("Analyze Sentiment of Your Text")
+     st.write(
+         "Enter a sentence or a short paragraph below to see its predicted sentiment distribution."
+     )
+
+     # Use text_area for potentially longer input
+     # Giving it a unique key helps maintain state if needed
+     user_text = st.text_area(
+         "Enter text here:",
+         key="text_input_area_key",
+         height=100,
+         placeholder="Type or paste your text...",
+     )
+
+     # Button to trigger the analysis
+     if st.button("Analyze Text", key="text_input_analyze_btn"):
+         # Check if the user actually entered something (not just whitespace)
+         if user_text and not user_text.isspace():
+             # Show a spinner while processing
+             with st.spinner("Analyzing your text..."):
+                 try:
+                     # Call the prediction function from predict.py
+                     # Pass the input text as a list with one element
+                     prediction_results = predict_sentiments([user_text])
+
+                     # Check if prediction was successful and returned expected format
+                     if (
+                         prediction_results
+                         and isinstance(prediction_results, list)
+                         and len(prediction_results) > 0
+                     ):
+                         # Get the result dictionary for the single input text
+                         result = prediction_results[0]
+                         predicted_label = result.get("label")
+                         scores = result.get(
+                             "scores"
+                         )  # This should be a dict like {'negative': 0.1, ...}
+
+                         # Make sure we got a valid label and scores dictionary
+                         if (
+                             predicted_label
+                             and scores
+                             and isinstance(scores, dict)
+                             and predicted_label != "Error"
+                         ):
+
+                             # Display the top predicted sentiment
+                             st.subheader("Predicted Sentiment:")
+                             # Using Streamlit's built-in status elements for color
+                             if predicted_label == "positive":
+                                 st.success(
+                                     f"The model thinks the sentiment is: **{predicted_label.capitalize()}** 👍"
+                                 )
+                             elif predicted_label == "negative":
+                                 st.error(
+                                     f"The model thinks the sentiment is: **{predicted_label.capitalize()}** 👎"
+                                 )
+                             else:  # Neutral or potentially "Unknown" if mapping failed
+                                 st.info(
+                                     f"The model thinks the sentiment is: **{predicted_label.capitalize()}** 😐"
+                                 )
+
+                             st.write("---")  # Adding a small separator
+                             st.subheader(
+                                 "Detailed Probabilities:"
+                             )  # Subheader for this section
+                             if scores and isinstance(scores, dict):
+                                 # Using columns here helps align the probabilities nicely
+                                 prob_col_neg, prob_col_neu, prob_col_pos = st.columns(3)
+
+                                 # Helper to get a score safely
+                                 def get_score(sentiment_name):
+                                     return scores.get(
+                                         sentiment_name.lower(), 0.0
+                                     )  # Use lowercase to be safe
+
+                                 value_font_size = "22px"
+                                 value_font_weight = "bold"
+
+                                 with prob_col_neg:
+                                     neg_prob = get_score("negative")
+                                     # Display label "Negative"
+                                     st.markdown("**Negative 👎:**")
+                                     # Display the probability, larger font, red color
+                                     st.markdown(
+                                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; color:red;'>{neg_prob:.1%}</p>",
+                                         unsafe_allow_html=True,
+                                     )
+
+                                 with prob_col_neu:
+                                     neu_prob = get_score("neutral")
+                                     # Display label "Neutral"
+                                     st.markdown("**Neutral 😐:**")
+                                     # Display the probability, larger font, grey color
+                                     st.markdown(
+                                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; color:grey;'>{neu_prob:.1%}</p>",
+                                         unsafe_allow_html=True,
+                                     )
+
+                                 with prob_col_pos:
+                                     pos_prob = get_score("positive")
+                                     # Display label "Positive"
+                                     st.markdown("**Positive 👍:**")
+                                     # Display the probability, larger font, green color
+                                     st.markdown(
+                                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; color:green;'>{pos_prob:.1%}</p>",
+                                         unsafe_allow_html=True,
+                                     )
+
+                             else:
+                                 # If scores dict is missing or invalid
+                                 st.write("Could not retrieve probability scores.")
+                             st.write("---")  # Another separator before the chart
+
+                             # --- Display Pie Chart of Probabilities ---
+                             st.subheader("Sentiment Probabilities:")
+                             if PLOTLY_AVAILABLE:
+                                 # Convert the scores dictionary to a DataFrame suitable for Plotly
+                                 # Assuming scores keys are 'negative', 'neutral', 'positive'
+                                 score_items = list(scores.items())
+                                 if score_items:  # Check if scores dict is not empty
+                                     df_scores = pd.DataFrame(
+                                         score_items,
+                                         columns=["Sentiment", "Probability"],
+                                     )
+                                     # Convert Probability to numeric just in case
+                                     df_scores["Probability"] = pd.to_numeric(
+                                         df_scores["Probability"]
+                                     )
+
+                                     # Define colors (keys must match the Sentiment names' case)
+                                     color_map = {
+                                         "positive": "green",
+                                         "neutral": "grey",
+                                         "negative": "red",
+                                     }
+                                     # Capitalize for display; keep lowercase keys for robust mapping
+                                     df_scores["Sentiment"] = df_scores[
+                                         "Sentiment"
+                                     ].str.capitalize()
+                                     df_scores["Sentiment_Lower"] = df_scores[
+                                         "Sentiment"
+                                     ].str.lower()
+                                     color_map_lower = {
+                                         k.lower(): v for k, v in color_map.items()
+                                     }
+
+                                     # Debug print for the dataframe fed to Plotly
+                                     # st.write("DEBUG: DataFrame for text input pie chart:")
+                                     # st.dataframe(df_scores)
+
+                                     try:
+                                         # Create the pie chart
+                                         fig_pie_text = px.pie(
+                                             df_scores,
+                                             values="Probability",  # Use the probability column
+                                             names="Sentiment",  # Labels for the slices
+                                             title="Probability Distribution per Class",
+                                             color="Sentiment_Lower",  # Use lowercase for mapping
+                                             color_discrete_map=color_map_lower,
+                                         )  # Map colors
+
+                                         # Update how text is shown on slices
+                                         fig_pie_text.update_traces(
+                                             textposition="inside",
+                                             textinfo="percent+label",
+                                             hovertemplate="Sentiment: %{label}<br>Probability: %{percent}",
+                                         )
+                                         # Improve text fitting
+                                         fig_pie_text.update_layout(
+                                             uniformtext_minsize=16,
+                                             uniformtext_mode="hide",
+                                         )
+
+                                         st.plotly_chart(
+                                             fig_pie_text, use_container_width=True
+                                         )
+
+                                     except Exception as plot_e:
+                                         st.error(
+                                             f"Sorry, couldn't create the probability pie chart: {str(plot_e)}"
+                                         )
+                                         print(
+                                             f"Full error during text input Plotly chart generation: {plot_e}"
+                                         )
+                                         import traceback
+
+                                         traceback.print_exc()
+                                         st.write(
+                                             "Raw scores:", scores
+                                         )  # Show raw scores as fallback
+
+                                 else:  # If scores dictionary was empty
+                                     st.warning(
+                                         "Received empty scores, cannot plot chart."
+                                     )
+
+                             elif not PLOTLY_AVAILABLE:
+                                 st.warning(
+                                     "Plotly not installed, cannot display pie chart. Showing raw scores instead."
+                                 )
+                                 st.json(
+                                     scores
+                                 )  # Display raw scores as JSON if no Plotly
+                             else:
+                                 # This case should be covered by the check above, but for safety
+                                 st.write("No valid score data available to plot.")
+                             # --- End Pie Chart ---
+
+                         else:
+                             # This handles cases where predict_sentiments returned an error label
+                             st.error(
+                                 f"Sentiment analysis failed for the input text. Result: {result}"
+                             )
+
+                     else:
+                         # This handles cases where predict_sentiments returned None or an empty list
+                         st.error(
+                             "Received no valid result from the prediction function."
+                         )
+
+                 except Exception as analysis_e:
+                     # Catch-all for other errors during analysis for this tab
+                     st.error(
+                         f"An error occurred during text analysis: {str(analysis_e)}"
+                     )
+                     print(f"Full error during text input analysis: {analysis_e}")
+                     import traceback
+
+                     traceback.print_exc()
+
+         else:
+             # If user clicks the button without entering text
+             st.warning("Please enter some text in the text area first!")
+
+ with tab_youtube:
+     st.header("YouTube Comment Sentiment Analyzer")
+     # Input field for URL or ID
+     video_url_input = st.text_input(
+         "Enter YouTube Video URL or Video ID:",
+         key="youtube_url_input_key",  # Giving it a unique key
+         placeholder="e.g., Z9kGRMglw-I or full URL",
+     )
+
+     # Button to trigger analysis
+     if st.button("Analyze YouTube Comments", key="youtube_analyze_button_key"):
+         if video_url_input:  # Check if user actually entered something
+             # analyze_youtube_video handles spinners internally now
+             analysis_results = analyze_youtube_video(video_url_input)
+
+             if (
+                 analysis_results and analysis_results["summary"]
+             ):  # Check if we got valid results
+                 summary = analysis_results["summary"]
+                 comments_data = analysis_results["comments_data"]
+                 video_title_display = summary.get(
+                     "video_title", "Video Title Not Available"
+                 )
+
+                 st.markdown("---")
+                 # Displaying the video title using markdown for potential formatting later
+                 st.markdown(f"### Analyzing Video: **{video_title_display}**")
+                 st.markdown("---")
+
+                 st.subheader("📊 Sentiment Summary")
+
+                 # Define desired font sizes (you can adjust these)
+                 # label_font_size = (
+                 #     "24px"  # Font size for the label text like "Comments Fetched"
+                 # )
+                 label_font_size = "24px"
+                 value_font_size = "28px"  # Font size for the actual count like "137"
+                 value_font_weight = "bold"  # Make the count bold
+
+                 # Define colors for the sentiment counts
+                 positive_color = "green"
+                 neutral_color = "grey"
+                 negative_color = "red"
+
+                 # Using 5 columns
+                 col_fetched, col_analyzed, col_pos, col_neu, col_neg = st.columns(5)
+
+                 # Metric 1: Comments Fetched
+                 with col_fetched:
+                     # Label for fetched comments
+                     st.markdown(
+                         f"<p style='font-size: {label_font_size}; margin-bottom: 0px;'>Comments Fetched</p>",
+                         unsafe_allow_html=True,
+                     )
+                     # The number of fetched comments
+                     st.markdown(
+                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; margin-top: 0px;'>{summary.get('num_comments_fetched', 0)}</p>",
+                         unsafe_allow_html=True,
+                     )
+
+                 # Metric 2: Comments Analyzed
+                 with col_analyzed:
+                     # Label for analyzed comments
+                     st.markdown(
+                         f"<p style='font-size: {label_font_size}; margin-bottom: 0px;'>Comments Analyzed</p>",
+                         unsafe_allow_html=True,
+                     )
+                     # The number of analyzed comments
+                     st.markdown(
+                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; margin-top: 0px;'>{summary.get('num_comments_analyzed', 0)}</p>",
+                         unsafe_allow_html=True,
+                     )
+
+                 # Metric 3: Positive
+                 with col_pos:
+                     # Label for positive comments, with emoji
+                     st.markdown(
+                         f"<p style='font-size: {label_font_size}; margin-bottom: 0px;'>Positive 👍</p>",
+                         unsafe_allow_html=True,
+                     )
+                     # The count of positive comments, green and bold
+                     st.markdown(
+                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; color:{positive_color}; margin-top: 0px;'>{summary.get('positive', 0)}</p>",
+                         unsafe_allow_html=True,
+                     )
+
+                 # Metric 4: Neutral
+                 with col_neu:
+                     # Label for neutral comments
+                     st.markdown(
+                         f"<p style='font-size: {label_font_size}; margin-bottom: 0px;'>Neutral 😐</p>",
+                         unsafe_allow_html=True,
+                     )
+                     # The count of neutral comments, grey and bold
+                     st.markdown(
+                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; color:{neutral_color}; margin-top: 0px;'>{summary.get('neutral', 0)}</p>",
+                         unsafe_allow_html=True,
+                     )
+
+                 # Metric 5: Negative
+                 with col_neg:
+                     # Label for negative comments
+                     st.markdown(
+                         f"<p style='font-size: {label_font_size}; margin-bottom: 0px;'>Negative 👎</p>",
+                         unsafe_allow_html=True,
+                     )
+                     # The count of negative comments, red and bold
+                     st.markdown(
+                         f"<p style='font-size: {value_font_size}; font-weight: {value_font_weight}; color:{negative_color}; margin-top: 0px;'>{summary.get('negative', 0)}</p>",
+                         unsafe_allow_html=True,
+                     )
+
+                 # Add a visual separator before charts
+                 st.markdown("---")
+
+                 # Data for charts - make sure it has counts > 0
+                 if summary.get("num_valid_predictions", 0) > 0:
+                     # Prepare DataFrame for Plotly charts
+                     sentiment_data_for_plot = [
+                         {"Sentiment": "Positive", "Count": summary.get("positive", 0)},
+                         {"Sentiment": "Neutral", "Count": summary.get("neutral", 0)},
+                         {"Sentiment": "Negative", "Count": summary.get("negative", 0)},
+                     ]
+                     sentiment_counts_df = pd.DataFrame(sentiment_data_for_plot)
+                     # Filter out rows where Count is 0 for cleaner charts
+                     sentiment_counts_df_for_plot = sentiment_counts_df[
+                         sentiment_counts_df["Count"] > 0
+                     ].copy()
+
+                     # Define the color map for charts
+                     # Keys should match the 'Sentiment' column values
+                     color_map = {
+                         "Positive": "green",
+                         "Neutral": "grey",
+                         "Negative": "red",
+                     }
+
+                     if not sentiment_counts_df_for_plot.empty:
+                         st.subheader("📈 Sentiment Distribution Charts")
+                         # Try to use Plotly for richer charts
+                         if PLOTLY_AVAILABLE:
+                             try:
+                                 # Pie chart: Plotly expects a DataFrame where one column holds values, another the names
+                                 fig_pie = px.pie(
+                                     sentiment_counts_df_for_plot,  # Use the filtered DataFrame
+                                     values="Count",  # Column for pie slice values
+                                     names="Sentiment",  # Column for pie slice names
+                                     title="Pie Chart: Comment Sentiments",
+                                     color="Sentiment",  # Color slices based on the 'Sentiment' category
+                                     color_discrete_map=color_map,
+                                 )  # Apply custom colors
+
+                                 fig_pie.update_traces(
+                                     textposition="inside",
+                                     textinfo="percent+label",
+                                     hovertemplate="Sentiment: %{label}<br>Count: %{value}<br>Percentage: %{percent}",
+                                 )
+
+                                 fig_pie.update_layout(
+                                     uniformtext_minsize=16, uniformtext_mode="hide"
+                                 )
+
+                                 st.plotly_chart(fig_pie, use_container_width=True)
+
+                                 # Bar chart (using Plotly for consistent coloring)
+                                 fig_bar = px.bar(
+                                     sentiment_counts_df_for_plot,  # Use the filtered DataFrame
+                                     x="Sentiment",  # Categories on X-axis
+                                     y="Count",  # Values on Y-axis
+                                     title="Bar Chart: Comment Sentiments",
+                                     color="Sentiment",  # Color bars based on 'Sentiment'
+                                     color_discrete_map=color_map,  # Apply custom colors
+                                     labels={
+                                         "Count": "Number of Comments",
+                                         "Sentiment": "Sentiment Category",
+                                     },
+                                 )  # Custom labels
+                                 st.plotly_chart(fig_bar, use_container_width=True)
+
+                             except Exception as plot_e:
+                                 # Fallback if Plotly fails for some reason other than import
+                                 st.error(
+                                     f"Sorry, couldn't create Plotly charts: {plot_e}"
+                                 )
+                                 st.write(
+                                     "Displaying basic bar chart instead (default colors):"
+                                 )
+                                 st.bar_chart(
+                                     sentiment_counts_df.set_index("Sentiment")
+                                 )  # Fallback with original (unfiltered for bar)
+                         else:
+                             # Fallback to Streamlit's basic bar chart if Plotly is not installed
+                             st.write(
+                                 "Displaying basic bar chart (Plotly not installed):"
+                             )
+                             st.bar_chart(
+                                 sentiment_counts_df.set_index("Sentiment")
+                             )  # Basic bar chart
+                     else:
+                         # This message shows if all sentiment counts are zero
+                         st.write(
+                             "No sentiment data (Positive, Neutral, Negative all zero) to display in charts."
+                         )
+                 else:
+                     # This message shows if no comments were analyzed successfully
+                     st.write(
+                         "Not enough valid sentiment data to display distribution charts."
+                     )
+
+                 # Display comments and their sentiments
+                 if comments_data:
+                     st.subheader(
+                         f"🔍 Analyzed Comments (showing first {len(comments_data)} results)"
+                     )
+                     comments_display_df = pd.DataFrame(comments_data)
+
+                     if "Confidence" in comments_display_df.columns:
+                         try:
+                             # Format as percentage with 1 decimal place
+                             comments_display_df["Confidence"] = comments_display_df[
+                                 "Confidence"
+                             ].map("{:.1%}".format)
+                         except (TypeError, ValueError):
+                             st.warning(
+                                 "Could not format confidence scores."
+                             )  # Handle potential errors if confidence is not numeric
+
+                     st.dataframe(
+                         comments_display_df, use_container_width=True, height=400
+                     )
+                 else:
+                     st.write("No comments were analyzed to display.")
+             # else:  # analyze_youtube_video already handles its own errors by showing st.error
+             #     st.info("Could not complete analysis. Please check the URL or try again.")
+         else:
+             # If user clicks the button without entering a URL
+             st.warning("Please enter a YouTube URL or Video ID first!")
+
+ with tab_twitter:
+     st.header("Twitter/X Post Analysis")
+     st.info("This feature is currently under construction. Please check back later!")
+     # Placeholder for future Twitter input
+     # twitter_url_input = st.text_input("Enter Twitter/X Post URL:", key="twitter_url_input_key")
+     # if st.button("Analyze Tweets", key="twitter_analyze_button_key"):
+     #     st.write("Imagine amazing Twitter analysis happening here... Tweet tweet!")
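
As a quick illustration of the URL parsing in extract_video_id, here is one of its four patterns exercised standalone (the pattern is copied from the list above; the video ID is the app's own placeholder):

    import re

    pattern = r"watch\?v=([a-zA-Z0-9_-]{11})"
    m = re.search(pattern, "https://www.youtube.com/watch?v=Z9kGRMglw-I")
    print(m.group(1))  # -> Z9kGRMglw-I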
src/youtube.py ADDED
@@ -0,0 +1,77 @@
+ import os
+ import googleapiclient.discovery
+ import googleapiclient.errors
+
+ # from dotenv import load_dotenv
+ import streamlit as st
+
+ # load_dotenv()
+ # api_key = os.getenv("API_KEY")
+ api_key = st.secrets["API_KEY"]
+
+
+ def get_comments(youtube, **kwargs):
+     comments = []
+     results = youtube.commentThreads().list(**kwargs).execute()
+
+     while results:
+         for item in results["items"]:
+             comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
+             comments.append(comment)
+
+         # Check if there are more pages of comments
+         if "nextPageToken" in results:
+             kwargs["pageToken"] = results["nextPageToken"]
+             results = youtube.commentThreads().list(**kwargs).execute()
+         else:
+             break
+
+     return comments
+
+
+ def main(video_id, api_key):
+     # Disable OAuthlib's HTTPS verification when running locally.
+     os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
+
+     youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
+
+     video_title = "N/A"  # Provide a default title
+
+     try:
+         # Get video details using the videos().list endpoint
+         print(f"DEBUG (youtube.py): Fetching video details for ID: {video_id}")
+         video_response = (
+             youtube.videos()
+             .list(
+                 part="snippet",  # 'snippet' contains title, description, channel, etc.
+                 id=video_id,  # The ID of the video we want info for
+             )
+             .execute()
+         )
+
+         # Extract the title from the response
+         # It's nested, so it's good to check that 'items' exists first
+         if video_response.get("items"):
+             video_title = video_response["items"][0]["snippet"]["title"]
+             print(f"DEBUG (youtube.py): Found title: '{video_title}'")  # Just a check
+         else:
+             print(f"WARN (youtube.py): No video items found for ID: {video_id}")
+             video_title = "Video Not Found or Private"  # More informative default
+
+     except Exception as e:
+         print(
+             f"ERROR (youtube.py): Failed to fetch video title for ID {video_id}. Error: {e}"
+         )
+         video_title = "Error Fetching Title"  # Error-specific default
+         # Depending on requirements, maybe we still want to proceed to get comments?
+
+     comments = get_comments(
+         youtube, part="snippet", videoId=video_id, textFormat="plainText"
+     )
+     # return comments
+     # Return a dictionary containing both title and comments
+     return {"title": video_title, "comments": comments}
+
+
+ def get_video_comments(video_id):
+     return main(video_id, api_key)
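
Since the module reads st.secrets["API_KEY"] (and the .gitignore above excludes .streamlit/), the key is expected in a local .streamlit/secrets.toml with a line of the form API_KEY = "...". Assuming a valid YouTube Data API v3 key is configured, a hypothetical direct call looks like:

    from youtube import get_video_comments  # run from inside src/

    data = get_video_comments("Z9kGRMglw-I")  # ID taken from the app's placeholder text
    print(data["title"], "-", len(data["comments"]), "top-level comments")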