piyushmaharana committed on
Commit b505f70 · 1 Parent(s): d0d7bec

working demo for wer

Files changed (5)
  1. .cursorignore +7 -0
  2. README.md +83 -4
  3. app.py +0 -159
  4. app_gradio.py +239 -0
  5. requirements.txt +1 -1
.cursorignore ADDED
@@ -0,0 +1,7 @@
+ # Add directories or file patterns to ignore during indexing (e.g. foo/ or *.csv)
+ venv/
+ *.log
+ *.log.*
+ *.log.*.*
+ *.log.*.*.*
+ *.log.*.*.*.*
README.md CHANGED
@@ -3,15 +3,15 @@ title: WER Evaluation Tool
  emoji: 🎯
  colorFrom: blue
  colorTo: red
- sdk: streamlit
- sdk_version: 1.31.1
- app_file: app.py
+ sdk: gradio
+ sdk_version: 5.16.0
+ app_file: app_gradio.py
  pinned: false
  ---

  # WER Evaluation Tool

- This Streamlit app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It's particularly useful for evaluating speech recognition or machine translation outputs.
+ This Gradio app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It's particularly useful for evaluating speech recognition or machine translation outputs.

  ## Features

@@ -29,3 +29,82 @@ This Streamlit app provides a user-friendly interface for calculating Word Error
  4. Click "Calculate WER" to see results

  ## Local Development
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/wer-evaluation-tool.git
+ cd wer-evaluation-tool
+ ```
+
+ 2. Create and activate a virtual environment using `uv`:
+ ```bash
+ uv venv
+ source .venv/bin/activate  # On Unix/macOS
+ # or
+ .venv\Scripts\activate     # On Windows
+ ```
+
+ 3. Install dependencies:
+ ```bash
+ uv pip install -r requirements.txt
+ ```
+
+ 4. Run the app locally:
+ ```bash
+ uv run python app_gradio.py
+ ```
+
+ ## Installation
+
+ You can install the package directly from PyPI:
+
+ ```bash
+ uv pip install wer-evaluation-tool
+ ```
+
+ ## Testing
+
+ Run the test suite using pytest:
+
+ ```bash
+ uv run pytest tests/
+ ```
+
+ ## Contributing
+
+ 1. Fork the repository
+ 2. Create a new branch (`git checkout -b feature/improvement`)
+ 3. Make your changes
+ 4. Run tests to ensure everything works
+ 5. Commit your changes (`git commit -am 'Add new feature'`)
+ 6. Push to the branch (`git push origin feature/improvement`)
+ 7. Create a Pull Request
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - Thanks to all contributors who have helped with the development
+ - Inspired by the need for better speech recognition evaluation tools
+ - Built with [Gradio](https://gradio.app/)
+
+ ## Contact
+
+ For questions or feedback, please:
+ - Open an issue in the GitHub repository
+ - Contact the maintainers at [email/contact information]
+
+ ## Citation
+
+ If you use this tool in your research, please cite:
+
+ ```bibtex
+ @software{wer_evaluation_tool,
+   title  = {WER Evaluation Tool},
+   author = {Your Name},
+   year   = {2024},
+   url    = {https://github.com/yourusername/wer-evaluation-tool}
+ }
+ ```
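As a sanity check on the numbers the app reports, here is a minimal sketch, not part of this commit, that scores the app's built-in example pair with the pinned `jiwer`; it assumes the `jiwer.process_words` API of the jiwer 3.x line.

```python
# Minimal sketch (not part of the commit): score the app's example pair.
# Assumes jiwer.process_words as provided by jiwer 3.x (jiwer==3.1.0 is pinned).
import jiwer

reference = "the quick brown fox jumps over the lazy dog"   # 9 words
hypothesis = "the quick brown fox jumped over lazy dog"     # 8 words

out = jiwer.process_words(reference, hypothesis)

# One substitution (jumps -> jumped), one deletion ("the"), no insertions:
# WER = (S + D + I) / N = (1 + 1 + 0) / 9 ≈ 0.222
print(f"WER={out.wer:.3f} MER={out.mer:.3f} WIL={out.wil:.3f} WIP={out.wip:.3f}")
print(f"S={out.substitutions} D={out.deletions} I={out.insertions} H={out.hits}")
```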
app.py DELETED
@@ -1,159 +0,0 @@
- import streamlit as st
- import jiwer
- import pandas as pd
- from typing import List, Optional
-
- def calculate_wer_metrics(
-     hypothesis: str,
-     reference: str,
-     normalize: bool = True,
-     words_to_filter: Optional[List[str]] = None
- ) -> dict:
-     """
-     Calculate WER metrics between hypothesis and reference texts.
-
-     Args:
-         hypothesis (str): The hypothesis text
-         reference (str): The reference text
-         normalize (bool): Whether to normalize texts before comparison
-         words_to_filter (List[str], optional): Words to filter out before comparison
-
-     Returns:
-         dict: Dictionary containing WER metrics
-     """
-     # Create transformation pipeline
-     if normalize:
-         transformation = jiwer.Compose([
-             jiwer.ToLowerCase(),
-             jiwer.RemoveMultipleSpaces(),
-             jiwer.RemovePunctuation(),
-             jiwer.Strip()
-         ])
-
-         # Add custom word filtering if specified
-         if words_to_filter:
-             transformation = jiwer.Compose([
-                 transformation,
-                 lambda x: ' '.join(word for word in x.split()
-                                    if word.lower() not in [w.lower() for w in words_to_filter])
-             ])
-     else:
-         transformation = None
-
-     # Calculate WER measures
-     measures = jiwer.compute_measures(
-         truth=reference,
-         hypothesis=hypothesis,
-         truth_transform=transformation,
-         hypothesis_transform=transformation
-     )
-
-     return measures
-
- def main():
-     st.set_page_config(
-         page_title="WER Evaluation Tool",
-         page_icon="🎯",
-         layout="wide"
-     )
-
-     st.title("Word Error Rate (WER) Evaluation Tool")
-     st.markdown("""
-     This tool helps you evaluate the Word Error Rate (WER) between a reference text and a hypothesis text.
-     WER is commonly used in speech recognition and machine translation evaluation.
-     """)
-
-     # Example button
-     if st.button("Load Example"):
-         reference = "the quick brown fox jumps over the lazy dog"
-         hypothesis = "the quick brown fox jumped over lazy dog"
-     else:
-         reference = ""
-         hypothesis = ""
-
-     # Input fields
-     col1, col2 = st.columns(2)
-
-     with col1:
-         reference = st.text_area(
-             "Reference Text",
-             value=reference,
-             height=150,
-             placeholder="Enter the reference text here..."
-         )
-
-     with col2:
-         hypothesis = st.text_area(
-             "Hypothesis Text",
-             value=hypothesis,
-             height=150,
-             placeholder="Enter the hypothesis text here..."
-         )
-
-     # Options
-     normalize = st.checkbox("Normalize text (lowercase, remove punctuation)", value=True)
-
-     words_to_filter_input = st.text_input(
-         "Words to filter (comma-separated)",
-         placeholder="e.g., um, uh, ah"
-     )
-
-     words_to_filter = [word.strip() for word in words_to_filter_input.split(",")] if words_to_filter_input else None
-
-     # Calculate button
-     if st.button("Calculate WER"):
-         if not reference or not hypothesis:
-             st.error("Please provide both reference and hypothesis texts.")
-             return
-
-         try:
-             measures = calculate_wer_metrics(
-                 hypothesis=hypothesis,
-                 reference=reference,
-                 normalize=normalize,
-                 words_to_filter=words_to_filter
-             )
-
-             # Display results
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 st.subheader("Main Metrics")
-                 metrics_df = pd.DataFrame({
-                     'Metric': ['WER', 'MER', 'WIL', 'WIP'],
-                     'Value': [
-                         f"{measures['wer']:.3f}",
-                         f"{measures['mer']:.3f}",
-                         f"{measures['wil']:.3f}",
-                         f"{measures['wip']:.3f}"
-                     ]
-                 })
-                 st.table(metrics_df)
-
-             with col2:
-                 st.subheader("Error Analysis")
-                 error_df = pd.DataFrame({
-                     'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
-                     'Count': [
-                         measures['substitutions'],
-                         measures['deletions'],
-                         measures['insertions'],
-                         measures['hits']
-                     ]
-                 })
-                 st.table(error_df)
-
-             # Add explanations
-             st.markdown("""
-             ### Metrics Explanation:
-             - **WER (Word Error Rate)**: The percentage of words that were incorrectly predicted
-             - **MER (Match Error Rate)**: The percentage of words that were incorrectly matched
-             - **WIL (Word Information Lost)**: The percentage of word information that was lost
-             - **WIP (Word Information Preserved)**: The percentage of word information that was preserved
-             """)
-
-         except Exception as e:
-             st.error(f"Error calculating WER: {str(e)}")
-
- if __name__ == "__main__":
-     main()
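The deleted Streamlit version folded filler-word filtering into the jiwer pipeline as a raw string-level lambda. For reference, here is the same idea as a standalone function; `filter_words` is a hypothetical helper sketched for illustration, not part of either version of the repo.

```python
# String-level filler-word filtering, equivalent to the lambda the deleted
# app.py composed into its jiwer pipeline (hypothetical standalone helper).
from typing import List

def filter_words(text: str, words_to_filter: List[str]) -> str:
    drop = {w.lower() for w in words_to_filter}
    return " ".join(word for word in text.split() if word.lower() not in drop)

print(filter_words("um the quick uh brown fox", ["um", "uh"]))
# -> "the quick brown fox"
```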
app_gradio.py ADDED
@@ -0,0 +1,239 @@
+ import gradio as gr
+ import jiwer
+ import pandas as pd
+ import logging
+ from typing import List, Optional, Tuple, Dict
+
+ # Set up logging configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     force=True,
+     handlers=[
+         logging.StreamHandler(),
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ def calculate_wer_metrics(
+     hypothesis: str,
+     reference: str,
+     normalize: bool = True,
+     words_to_filter: Optional[List[str]] = None
+ ) -> Dict:
+     """
+     Calculate WER metrics between hypothesis and reference texts.
+
+     Args:
+         hypothesis (str): The hypothesis text
+         reference (str): The reference text
+         normalize (bool): Whether to normalize texts before comparison
+         words_to_filter (List[str], optional): Words to filter out before comparison
+
+     Returns:
+         dict: Dictionary containing WER metrics
+
+     Raises:
+         ValueError: If inputs are invalid or result in empty text after processing
+     """
+     logger.info(f"Calculating WER metrics with inputs - Hypothesis: {hypothesis}, Reference: {reference}")
+
+     # Validate inputs
+     if not hypothesis.strip() or not reference.strip():
+         raise ValueError("Both hypothesis and reference texts must contain non-empty strings")
+
+     if normalize:
+         # Define basic transformations
+         basic_transform = jiwer.Compose([
+             jiwer.ExpandCommonEnglishContractions(),
+             jiwer.ToLowerCase(),
+             jiwer.RemoveMultipleSpaces(),
+             jiwer.RemovePunctuation(),
+             jiwer.Strip(),
+             jiwer.ReduceToListOfListOfWords()
+         ])
+
+         if words_to_filter and any(words_to_filter):
+             # ReduceToListOfListOfWords yields a list of sentences, each a list
+             # of words, so the filter must operate sentence by sentence.
+             def filter_words_transform(sentences: List[List[str]]) -> List[List[str]]:
+                 drop = {w.lower() for w in words_to_filter}
+                 filtered = [[word for word in sentence if word.lower() not in drop]
+                             for sentence in sentences]
+                 if not any(filtered):
+                     raise ValueError("Text is empty after filtering words")
+                 return filtered
+
+             transformation = jiwer.Compose([
+                 basic_transform,
+                 filter_words_transform
+             ])
+         else:
+             transformation = basic_transform
+
+         # Pre-check the transformed text
+         try:
+             transformed_ref = transformation(reference)
+             transformed_hyp = transformation(hypothesis)
+             if not transformed_ref or not transformed_hyp:
+                 raise ValueError("Text is empty after normalization")
+             logger.debug(f"Transformed reference: {transformed_ref}")
+             logger.debug(f"Transformed hypothesis: {transformed_hyp}")
+         except Exception as e:
+             logger.error(f"Transformation error: {str(e)}")
+             raise ValueError(f"Error during text transformation: {str(e)}")
+
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis,
+             truth_transform=transformation,
+             hypothesis_transform=transformation
+         )
+     else:
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis
+         )
+
+     return measures
+
+ def process_inputs(
+     reference: str,
+     hypothesis: str,
+     normalize: bool,
+     words_to_filter: str
+ ) -> Tuple[str, str, str, str]:
+     """
+     Process inputs and calculate WER metrics.
+
+     Args:
+         reference (str): Reference text
+         hypothesis (str): Hypothesis text
+         normalize (bool): Whether to normalize text
+         words_to_filter (str): Comma-separated words to filter
+
+     Returns:
+         Tuple[str, str, str, str]: HTML formatted main metrics, error analysis,
+             explanations, and error message
+     """
+     if not reference or not hypothesis:
+         return "Please provide both reference and hypothesis texts.", "", "", ""
+
+     try:
+         filter_words = [word.strip() for word in words_to_filter.split(",")] if words_to_filter else None
+         measures = calculate_wer_metrics(
+             hypothesis=hypothesis,
+             reference=reference,
+             normalize=normalize,
+             words_to_filter=filter_words
+         )
+
+         # Format main metrics
+         metrics_df = pd.DataFrame({
+             'Metric': ['WER', 'MER', 'WIL', 'WIP'],
+             'Value': [
+                 f"{measures['wer']:.3f}",
+                 f"{measures['mer']:.3f}",
+                 f"{measures['wil']:.3f}",
+                 f"{measures['wip']:.3f}"
+             ]
+         })
+
+         # Format error analysis
+         error_df = pd.DataFrame({
+             'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
+             'Count': [
+                 measures['substitutions'],
+                 measures['deletions'],
+                 measures['insertions'],
+                 measures['hits']
+             ]
+         })
+
+         metrics_html = metrics_df.to_html(index=False)
+         error_html = error_df.to_html(index=False)
+
+         explanation = """
+         <h3>Metrics Explanation:</h3>
+         <ul>
+             <li><b>WER (Word Error Rate)</b>: The percentage of words that were incorrectly predicted</li>
+             <li><b>MER (Match Error Rate)</b>: The percentage of words that were incorrectly matched</li>
+             <li><b>WIL (Word Information Lost)</b>: The percentage of word information that was lost</li>
+             <li><b>WIP (Word Information Preserved)</b>: The percentage of word information that was preserved</li>
+         </ul>
+         """
+
+         return metrics_html, error_html, explanation, ""
+
+     except Exception as e:
+         error_msg = f"Error calculating WER: {str(e)}"
+         logger.error(error_msg)
+         return "", "", "", error_msg
+
+ def load_example() -> Tuple[str, str]:
+     """Load example texts for demonstration."""
+     return (
+         "the quick brown fox jumps over the lazy dog",
+         "the quick brown fox jumped over lazy dog"
+     )
+
+ def create_interface() -> gr.Blocks:
+     """Create the Gradio interface."""
+     with gr.Blocks(title="WER Evaluation Tool") as interface:
+         gr.Markdown("# Word Error Rate (WER) Evaluation Tool")
+         gr.Markdown(
+             "This tool helps you evaluate the Word Error Rate (WER) between a reference "
+             "text and a hypothesis text. WER is commonly used in speech recognition and "
+             "machine translation evaluation."
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 reference = gr.Textbox(
+                     label="Reference Text",
+                     placeholder="Enter the reference text here...",
+                     lines=5
+                 )
+             with gr.Column():
+                 hypothesis = gr.Textbox(
+                     label="Hypothesis Text",
+                     placeholder="Enter the hypothesis text here...",
+                     lines=5
+                 )
+
+         with gr.Row():
+             normalize = gr.Checkbox(
+                 label="Normalize text (lowercase, remove punctuation)",
+                 value=True
+             )
+             words_to_filter = gr.Textbox(
+                 label="Words to filter (comma-separated)",
+                 placeholder="e.g., um, uh, ah"
+             )
+
+         with gr.Row():
+             example_btn = gr.Button("Load Example")
+             calculate_btn = gr.Button("Calculate WER", variant="primary")
+
+         with gr.Row():
+             metrics_output = gr.HTML(label="Main Metrics")
+             error_output = gr.HTML(label="Error Analysis")
+
+         explanation_output = gr.HTML()
+         error_msg_output = gr.HTML()
+
+         # Event handlers
+         example_btn.click(
+             load_example,
+             outputs=[reference, hypothesis]
+         )
+
+         calculate_btn.click(
+             process_inputs,
+             inputs=[reference, hypothesis, normalize, words_to_filter],
+             outputs=[metrics_output, error_output, explanation_output, error_msg_output]
+         )
+
+     return interface
+
+ if __name__ == "__main__":
+     logger.info("Application started")
+     app = create_interface()
+     app.launch()
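To see what the normalization chain in `calculate_wer_metrics` actually hands to the scorer, here is a small sketch, not part of the commit, applying the same `jiwer.Compose` pipeline to a raw string; the printed shape is indicative.

```python
# Sketch of the normalization chain used above: contractions expanded, case
# folded, punctuation stripped, then reduced to a list of word lists, which
# is the shape jiwer transforms pass along to the scoring step.
import jiwer

basic_transform = jiwer.Compose([
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemovePunctuation(),
    jiwer.Strip(),
    jiwer.ReduceToListOfListOfWords()
])

print(basic_transform("it's the   QUICK, brown fox!"))
# roughly: [['it', 'is', 'the', 'quick', 'brown', 'fox']]
```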
requirements.txt CHANGED
@@ -1,3 +1,3 @@
- streamlit==1.31.1
+ gradio==5.16.0
  jiwer==3.1.0
  pandas==2.2.0