Commit b505f70
Parent(s): d0d7bec

working demo for wer

Files changed:
- .cursorignore +7 -0
- README.md +83 -4
- app.py +0 -159
- app_gradio.py +239 -0
- requirements.txt +1 -1
.cursorignore
ADDED
@@ -0,0 +1,7 @@
+# Add directories or file patterns to ignore during indexing (e.g. foo/ or *.csv)
+venv/
+*.log
+*.log.*
+*.log.*.*
+*.log.*.*.*
+*.log.*.*.*.*
README.md
CHANGED
@@ -3,15 +3,15 @@ title: WER Evaluation Tool
 emoji: 🎯
 colorFrom: blue
 colorTo: red
-sdk: streamlit
-sdk_version:
-app_file: app.py
+sdk: gradio
+sdk_version: 5.16.0
+app_file: app_gradio.py
 pinned: false
 ---
 
 # WER Evaluation Tool
 
-This Streamlit app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It's particularly useful for evaluating speech recognition or machine translation outputs.
+This Gradio app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It's particularly useful for evaluating speech recognition or machine translation outputs.
 
 ## Features
 
@@ -29,3 +29,82 @@ This Streamlit app provides a user-friendly interface for calculating Word Error
 4. Click "Calculate WER" to see results
 
 ## Local Development
+
+1. Clone the repository:
+```bash
+git clone https://github.com/yourusername/wer-evaluation-tool.git
+cd wer-evaluation-tool
+```
+
+2. Create and activate a virtual environment using `uv`:
+```bash
+uv venv
+source .venv/bin/activate  # On Unix/macOS
+# or
+.venv\Scripts\activate  # On Windows
+```
+
+3. Install dependencies:
+```bash
+uv pip install -r requirements.txt
+```
+
+4. Run the app locally:
+```bash
+uv run python app_gradio.py
+```
+
+## Installation
+
+You can install the package directly from PyPI:
+
+```bash
+uv pip install wer-evaluation-tool
+```
+
+## Testing
+
+Run the test suite using pytest:
+
+```bash
+uv run pytest tests/
+```
+
+## Contributing
+
+1. Fork the repository
+2. Create a new branch (`git checkout -b feature/improvement`)
+3. Make your changes
+4. Run tests to ensure everything works
+5. Commit your changes (`git commit -am 'Add new feature'`)
+6. Push to the branch (`git push origin feature/improvement`)
+7. Create a Pull Request
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Acknowledgments
+
+- Thanks to all contributors who have helped with the development
+- Inspired by the need for better speech recognition evaluation tools
+- Built with [Gradio](https://gradio.app/)
+
+## Contact
+
+For questions or feedback, please:
+- Open an issue in the GitHub repository
+- Contact the maintainers at [email/contact information]
+
+## Citation
+
+If you use this tool in your research, please cite:
+
+```bibtex
+@software{wer_evaluation_tool,
+  title = {WER Evaluation Tool},
+  author = {Your Name},
+  year = {2024},
+  url = {https://github.com/yourusername/wer-evaluation-tool}
+}
+```
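The commit's built-in example pair makes the reported numbers easy to check by hand: against the 9-word reference, the hypothesis has one substitution (jumps → jumped) and one deletion (the second "the"), so WER = (S + D + I) / N = (1 + 1 + 0) / 9 ≈ 0.222. A minimal sketch of the same check, assuming the `process_words` API of the pinned jiwer==3.1.0 (the committed app itself calls `compute_measures`):

```python
import jiwer

reference = "the quick brown fox jumps over the lazy dog"   # 9 words
hypothesis = "the quick brown fox jumped over lazy dog"     # 8 words

# "jumps" -> "jumped" is a substitution; the second "the" is deleted.
# WER = (S + D + I) / N = (1 + 1 + 0) / 9 ≈ 0.222
out = jiwer.process_words(reference, hypothesis)
print(f"WER={out.wer:.3f} S={out.substitutions} D={out.deletions} "
      f"I={out.insertions} H={out.hits}")
```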
app.py
DELETED
@@ -1,159 +0,0 @@
-import streamlit as st
-import jiwer
-import pandas as pd
-from typing import List, Optional
-
-def calculate_wer_metrics(
-    hypothesis: str,
-    reference: str,
-    normalize: bool = True,
-    words_to_filter: Optional[List[str]] = None
-) -> dict:
-    """
-    Calculate WER metrics between hypothesis and reference texts.
-
-    Args:
-        hypothesis (str): The hypothesis text
-        reference (str): The reference text
-        normalize (bool): Whether to normalize texts before comparison
-        words_to_filter (List[str], optional): Words to filter out before comparison
-
-    Returns:
-        dict: Dictionary containing WER metrics
-    """
-    # Create transformation pipeline
-    if normalize:
-        transformation = jiwer.Compose([
-            jiwer.ToLowerCase(),
-            jiwer.RemoveMultipleSpaces(),
-            jiwer.RemovePunctuation(),
-            jiwer.Strip()
-        ])
-
-        # Add custom word filtering if specified
-        if words_to_filter:
-            transformation = jiwer.Compose([
-                transformation,
-                lambda x: ' '.join(word for word in x.split()
-                                   if word.lower() not in [w.lower() for w in words_to_filter])
-            ])
-    else:
-        transformation = None
-
-    # Calculate WER measures
-    measures = jiwer.compute_measures(
-        truth=reference,
-        hypothesis=hypothesis,
-        truth_transform=transformation,
-        hypothesis_transform=transformation
-    )
-
-    return measures
-
-def main():
-    st.set_page_config(
-        page_title="WER Evaluation Tool",
-        page_icon="🎯",
-        layout="wide"
-    )
-
-    st.title("Word Error Rate (WER) Evaluation Tool")
-    st.markdown("""
-    This tool helps you evaluate the Word Error Rate (WER) between a reference text and a hypothesis text.
-    WER is commonly used in speech recognition and machine translation evaluation.
-    """)
-
-    # Example button
-    if st.button("Load Example"):
-        reference = "the quick brown fox jumps over the lazy dog"
-        hypothesis = "the quick brown fox jumped over lazy dog"
-    else:
-        reference = ""
-        hypothesis = ""
-
-    # Input fields
-    col1, col2 = st.columns(2)
-
-    with col1:
-        reference = st.text_area(
-            "Reference Text",
-            value=reference,
-            height=150,
-            placeholder="Enter the reference text here..."
-        )
-
-    with col2:
-        hypothesis = st.text_area(
-            "Hypothesis Text",
-            value=hypothesis,
-            height=150,
-            placeholder="Enter the hypothesis text here..."
-        )
-
-    # Options
-    normalize = st.checkbox("Normalize text (lowercase, remove punctuation)", value=True)
-
-    words_to_filter_input = st.text_input(
-        "Words to filter (comma-separated)",
-        placeholder="e.g., um, uh, ah"
-    )
-
-    words_to_filter = [word.strip() for word in words_to_filter_input.split(",")] if words_to_filter_input else None
-
-    # Calculate button
-    if st.button("Calculate WER"):
-        if not reference or not hypothesis:
-            st.error("Please provide both reference and hypothesis texts.")
-            return
-
-        try:
-            measures = calculate_wer_metrics(
-                hypothesis=hypothesis,
-                reference=reference,
-                normalize=normalize,
-                words_to_filter=words_to_filter
-            )
-
-            # Display results
-            col1, col2 = st.columns(2)
-
-            with col1:
-                st.subheader("Main Metrics")
-                metrics_df = pd.DataFrame({
-                    'Metric': ['WER', 'MER', 'WIL', 'WIP'],
-                    'Value': [
-                        f"{measures['wer']:.3f}",
-                        f"{measures['mer']:.3f}",
-                        f"{measures['wil']:.3f}",
-                        f"{measures['wip']:.3f}"
-                    ]
-                })
-                st.table(metrics_df)
-
-            with col2:
-                st.subheader("Error Analysis")
-                error_df = pd.DataFrame({
-                    'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
-                    'Count': [
-                        measures['substitutions'],
-                        measures['deletions'],
-                        measures['insertions'],
-                        measures['hits']
-                    ]
-                })
-                st.table(error_df)
-
-            # Add explanations
-            st.markdown("""
-            ### Metrics Explanation:
-            - **WER (Word Error Rate)**: The percentage of words that were incorrectly predicted
-            - **MER (Match Error Rate)**: The percentage of words that were incorrectly matched
-            - **WIL (Word Information Lost)**: The percentage of word information that was lost
-            - **WIP (Word Information Preserved)**: The percentage of word information that was preserved
-            """)
-
-        except Exception as e:
-            st.error(f"Error calculating WER: {str(e)}")
-
-if __name__ == "__main__":
-    main()
app_gradio.py
ADDED
@@ -0,0 +1,239 @@
+import gradio as gr
+import jiwer
+import pandas as pd
+import logging
+from typing import List, Optional, Tuple, Dict
+
+# Set up logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    force=True,
+    handlers=[
+        logging.StreamHandler(),
+    ]
+)
+logger = logging.getLogger(__name__)
+
+def calculate_wer_metrics(
+    hypothesis: str,
+    reference: str,
+    normalize: bool = True,
+    words_to_filter: Optional[List[str]] = None
+) -> Dict:
+    """
+    Calculate WER metrics between hypothesis and reference texts.
+
+    Args:
+        hypothesis (str): The hypothesis text
+        reference (str): The reference text
+        normalize (bool): Whether to normalize texts before comparison
+        words_to_filter (List[str], optional): Words to filter out before comparison
+
+    Returns:
+        dict: Dictionary containing WER metrics
+
+    Raises:
+        ValueError: If inputs are invalid or result in empty text after processing
+    """
+    logger.info(f"Calculating WER metrics with inputs - Hypothesis: {hypothesis}, Reference: {reference}")
+
+    # Validate inputs
+    if not hypothesis.strip() or not reference.strip():
+        raise ValueError("Both hypothesis and reference texts must contain non-empty strings")
+
+    if normalize:
+        # Define basic transformations
+        basic_transform = jiwer.Compose([
+            jiwer.ExpandCommonEnglishContractions(),
+            jiwer.ToLowerCase(),
+            jiwer.RemoveMultipleSpaces(),
+            jiwer.RemovePunctuation(),
+            jiwer.Strip(),
+            jiwer.ReduceToListOfListOfWords()
+        ])
+
+        if words_to_filter and any(words_to_filter):
+            def filter_words_transform(words: List[str]) -> List[str]:
+                filtered = [word for word in words
+                            if word.lower() not in [w.lower() for w in words_to_filter]]
+                if not filtered:
+                    raise ValueError("Text is empty after filtering words")
+                return filtered
+
+            transformation = jiwer.Compose([
+                basic_transform,
+                filter_words_transform
+            ])
+        else:
+            transformation = basic_transform
+
+        # Pre-check the transformed text
+        try:
+            transformed_ref = transformation(reference)
+            transformed_hyp = transformation(hypothesis)
+            if not transformed_ref or not transformed_hyp:
+                raise ValueError("Text is empty after normalization")
+            logger.debug(f"Transformed reference: {transformed_ref}")
+            logger.debug(f"Transformed hypothesis: {transformed_hyp}")
+        except Exception as e:
+            logger.error(f"Transformation error: {str(e)}")
+            raise ValueError(f"Error during text transformation: {str(e)}")
+
+        measures = jiwer.compute_measures(
+            truth=reference,
+            hypothesis=hypothesis,
+            truth_transform=transformation,
+            hypothesis_transform=transformation
+        )
+    else:
+        measures = jiwer.compute_measures(
+            truth=reference,
+            hypothesis=hypothesis
+        )
+
+    return measures
+
+def process_inputs(
+    reference: str,
+    hypothesis: str,
+    normalize: bool,
+    words_to_filter: str
+) -> Tuple[str, str, str, str]:
+    """
+    Process inputs and calculate WER metrics.
+
+    Args:
+        reference (str): Reference text
+        hypothesis (str): Hypothesis text
+        normalize (bool): Whether to normalize text
+        words_to_filter (str): Comma-separated words to filter
+
+    Returns:
+        Tuple[str, str, str, str]: HTML formatted main metrics, error analysis,
+            and explanations
+    """
+    if not reference or not hypothesis:
+        return "Please provide both reference and hypothesis texts.", "", "", ""
+
+    try:
+        filter_words = [word.strip() for word in words_to_filter.split(",")] if words_to_filter else None
+        measures = calculate_wer_metrics(
+            hypothesis=hypothesis,
+            reference=reference,
+            normalize=normalize,
+            words_to_filter=filter_words
+        )
+
+        # Format main metrics
+        metrics_df = pd.DataFrame({
+            'Metric': ['WER', 'MER', 'WIL', 'WIP'],
+            'Value': [
+                f"{measures['wer']:.3f}",
+                f"{measures['mer']:.3f}",
+                f"{measures['wil']:.3f}",
+                f"{measures['wip']:.3f}"
+            ]
+        })
+
+        # Format error analysis
+        error_df = pd.DataFrame({
+            'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
+            'Count': [
+                measures['substitutions'],
+                measures['deletions'],
+                measures['insertions'],
+                measures['hits']
+            ]
+        })
+
+        metrics_html = metrics_df.to_html(index=False)
+        error_html = error_df.to_html(index=False)
+
+        explanation = """
+        <h3>Metrics Explanation:</h3>
+        <ul>
+            <li><b>WER (Word Error Rate)</b>: The percentage of words that were incorrectly predicted</li>
+            <li><b>MER (Match Error Rate)</b>: The percentage of words that were incorrectly matched</li>
+            <li><b>WIL (Word Information Lost)</b>: The percentage of word information that was lost</li>
+            <li><b>WIP (Word Information Preserved)</b>: The percentage of word information that was preserved</li>
+        </ul>
+        """
+
+        return metrics_html, error_html, explanation, ""
+
+    except Exception as e:
+        error_msg = f"Error calculating WER: {str(e)}"
+        logger.error(error_msg)
+        return "", "", "", error_msg
+
+def load_example() -> Tuple[str, str]:
+    """Load example texts for demonstration."""
+    return (
+        "the quick brown fox jumps over the lazy dog",
+        "the quick brown fox jumped over lazy dog"
+    )
+
+def create_interface() -> gr.Blocks:
+    """Create the Gradio interface."""
+    with gr.Blocks(title="WER Evaluation Tool") as interface:
+        gr.Markdown("# Word Error Rate (WER) Evaluation Tool")
+        gr.Markdown(
+            "This tool helps you evaluate the Word Error Rate (WER) between a reference "
+            "text and a hypothesis text. WER is commonly used in speech recognition and "
+            "machine translation evaluation."
+        )
+
+        with gr.Row():
+            with gr.Column():
+                reference = gr.Textbox(
+                    label="Reference Text",
+                    placeholder="Enter the reference text here...",
+                    lines=5
+                )
+            with gr.Column():
+                hypothesis = gr.Textbox(
+                    label="Hypothesis Text",
+                    placeholder="Enter the hypothesis text here...",
+                    lines=5
+                )
+
+        with gr.Row():
+            normalize = gr.Checkbox(
+                label="Normalize text (lowercase, remove punctuation)",
+                value=True
+            )
+            words_to_filter = gr.Textbox(
+                label="Words to filter (comma-separated)",
+                placeholder="e.g., um, uh, ah"
+            )
+
+        with gr.Row():
+            example_btn = gr.Button("Load Example")
+            calculate_btn = gr.Button("Calculate WER", variant="primary")
+
+        with gr.Row():
+            metrics_output = gr.HTML(label="Main Metrics")
+            error_output = gr.HTML(label="Error Analysis")
+
+        explanation_output = gr.HTML()
+        error_msg_output = gr.HTML()
+
+        # Event handlers
+        example_btn.click(
+            load_example,
+            outputs=[reference, hypothesis]
+        )
+
+        calculate_btn.click(
+            process_inputs,
+            inputs=[reference, hypothesis, normalize, words_to_filter],
+            outputs=[metrics_output, error_output, explanation_output, error_msg_output]
+        )
+
+    return interface
+
+if __name__ == "__main__":
+    logger.info("Application started")
+    app = create_interface()
+    app.launch()
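Because `app.launch()` is guarded by `__main__`, the metrics function can be smoke-tested without starting the UI. A minimal sketch, run from the repo root with the pinned requirements installed (the expected values are an assumption based on the example pair above):

```python
# Hypothetical smoke test: importing app_gradio only configures logging and
# defines functions; launch() does not run.
from app_gradio import calculate_wer_metrics

measures = calculate_wer_metrics(
    hypothesis="the quick brown fox jumped over lazy dog",
    reference="the quick brown fox jumps over the lazy dog",
    normalize=True,
)
# Expect WER ≈ 0.222: 1 substitution + 1 deletion over 9 reference words.
print(measures["wer"], measures["substitutions"], measures["deletions"])
```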
requirements.txt
CHANGED
@@ -1,3 +1,3 @@
-streamlit
+gradio==5.16.0
 jiwer==3.1.0
 pandas==2.2.0