piyushmaharana committed on
Commit b505f70 · 1 Parent(s): d0d7bec

working demo for wer

Files changed (5)
  1. .cursorignore +7 -0
  2. README.md +83 -4
  3. app.py +0 -159
  4. app_gradio.py +239 -0
  5. requirements.txt +1 -1
.cursorignore ADDED
@@ -0,0 +1,7 @@
+ # Add directories or file patterns to ignore during indexing (e.g. foo/ or *.csv)
+ venv/
+ *.log
+ *.log.*
+ *.log.*.*
+ *.log.*.*.*
+ *.log.*.*.*.*
README.md CHANGED
@@ -3,15 +3,15 @@ title: WER Evaluation Tool
  emoji: 🎯
  colorFrom: blue
  colorTo: red
- sdk: streamlit
- sdk_version: 1.31.1
- app_file: app.py
+ sdk: gradio
+ sdk_version: 5.16.0
+ app_file: app_gradio.py
  pinned: false
  ---

  # WER Evaluation Tool

- This Streamlit app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It's particularly useful for evaluating speech recognition or machine translation outputs.
+ This Gradio app provides a user-friendly interface for calculating Word Error Rate (WER) and related metrics between reference and hypothesis texts. It's particularly useful for evaluating speech recognition or machine translation outputs.

  ## Features

@@ -29,3 +29,82 @@ This Streamlit app provides a user-friendly interface for calculating Word Error
  4. Click "Calculate WER" to see results

  ## Local Development
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/wer-evaluation-tool.git
+ cd wer-evaluation-tool
+ ```
+
+ 2. Create and activate a virtual environment using `uv`:
+ ```bash
+ uv venv
+ source .venv/bin/activate  # On Unix/macOS
+ # or
+ .venv\Scripts\activate     # On Windows
+ ```
+
+ 3. Install dependencies:
+ ```bash
+ uv pip install -r requirements.txt
+ ```
+
+ 4. Run the app locally:
+ ```bash
+ uv run python app_gradio.py
+ ```
+
+ ## Installation
+
+ You can install the package directly from PyPI:
+
+ ```bash
+ uv pip install wer-evaluation-tool
+ ```
+
+ ## Testing
+
+ Run the test suite using pytest:
+
+ ```bash
+ uv run pytest tests/
+ ```
+
+ ## Contributing
+
+ 1. Fork the repository
+ 2. Create a new branch (`git checkout -b feature/improvement`)
+ 3. Make your changes
+ 4. Run tests to ensure everything works
+ 5. Commit your changes (`git commit -am 'Add new feature'`)
+ 6. Push to the branch (`git push origin feature/improvement`)
+ 7. Create a Pull Request
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Acknowledgments
+
+ - Thanks to all contributors who have helped with the development
+ - Inspired by the need for better speech recognition evaluation tools
+ - Built with [Gradio](https://gradio.app/)
+
+ ## Contact
+
+ For questions or feedback, please:
+ - Open an issue in the GitHub repository
+ - Contact the maintainers at [email/contact information]
+
+ ## Citation
+
+ If you use this tool in your research, please cite:
+
+ ```bibtex
+ @software{wer_evaluation_tool,
+   title  = {WER Evaluation Tool},
+   author = {Your Name},
+   year   = {2024},
+   url    = {https://github.com/yourusername/wer-evaluation-tool}
+ }
+ ```
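As a sanity check on the numbers the app reports, here is a minimal sketch, not part of this commit, that scores the app's built-in example pair with the pinned `jiwer`; it assumes the `jiwer.process_words` API of the jiwer 3.x line.

```python
# Minimal sketch (not part of the commit): score the app's example pair.
# Assumes jiwer.process_words as provided by jiwer 3.x (jiwer==3.1.0 is pinned).
import jiwer

reference = "the quick brown fox jumps over the lazy dog"   # 9 words
hypothesis = "the quick brown fox jumped over lazy dog"     # 8 words

out = jiwer.process_words(reference, hypothesis)

# One substitution (jumps -> jumped), one deletion ("the"), no insertions:
# WER = (S + D + I) / N = (1 + 1 + 0) / 9 ≈ 0.222
print(f"WER={out.wer:.3f} MER={out.mer:.3f} WIL={out.wil:.3f} WIP={out.wip:.3f}")
print(f"S={out.substitutions} D={out.deletions} I={out.insertions} H={out.hits}")
```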
app.py DELETED
@@ -1,159 +0,0 @@
- import streamlit as st
- import jiwer
- import pandas as pd
- from typing import List, Optional
-
- def calculate_wer_metrics(
-     hypothesis: str,
-     reference: str,
-     normalize: bool = True,
-     words_to_filter: Optional[List[str]] = None
- ) -> dict:
-     """
-     Calculate WER metrics between hypothesis and reference texts.
-
-     Args:
-         hypothesis (str): The hypothesis text
-         reference (str): The reference text
-         normalize (bool): Whether to normalize texts before comparison
-         words_to_filter (List[str], optional): Words to filter out before comparison
-
-     Returns:
-         dict: Dictionary containing WER metrics
-     """
-     # Create transformation pipeline
-     if normalize:
-         transformation = jiwer.Compose([
-             jiwer.ToLowerCase(),
-             jiwer.RemoveMultipleSpaces(),
-             jiwer.RemovePunctuation(),
-             jiwer.Strip()
-         ])
-
-         # Add custom word filtering if specified
-         if words_to_filter:
-             transformation = jiwer.Compose([
-                 transformation,
-                 lambda x: ' '.join(word for word in x.split()
-                                    if word.lower() not in [w.lower() for w in words_to_filter])
-             ])
-     else:
-         transformation = None
-
-     # Calculate WER measures
-     measures = jiwer.compute_measures(
-         truth=reference,
-         hypothesis=hypothesis,
-         truth_transform=transformation,
-         hypothesis_transform=transformation
-     )
-
-     return measures
-
- def main():
-     st.set_page_config(
-         page_title="WER Evaluation Tool",
-         page_icon="🎯",
-         layout="wide"
-     )
-
-     st.title("Word Error Rate (WER) Evaluation Tool")
-     st.markdown("""
-     This tool helps you evaluate the Word Error Rate (WER) between a reference text and a hypothesis text.
-     WER is commonly used in speech recognition and machine translation evaluation.
-     """)
-
-     # Example button
-     if st.button("Load Example"):
-         reference = "the quick brown fox jumps over the lazy dog"
-         hypothesis = "the quick brown fox jumped over lazy dog"
-     else:
-         reference = ""
-         hypothesis = ""
-
-     # Input fields
-     col1, col2 = st.columns(2)
-
-     with col1:
-         reference = st.text_area(
-             "Reference Text",
-             value=reference,
-             height=150,
-             placeholder="Enter the reference text here..."
-         )
-
-     with col2:
-         hypothesis = st.text_area(
-             "Hypothesis Text",
-             value=hypothesis,
-             height=150,
-             placeholder="Enter the hypothesis text here..."
-         )
-
-     # Options
-     normalize = st.checkbox("Normalize text (lowercase, remove punctuation)", value=True)
-
-     words_to_filter_input = st.text_input(
-         "Words to filter (comma-separated)",
-         placeholder="e.g., um, uh, ah"
-     )
-
-     words_to_filter = [word.strip() for word in words_to_filter_input.split(",")] if words_to_filter_input else None
-
-     # Calculate button
-     if st.button("Calculate WER"):
-         if not reference or not hypothesis:
-             st.error("Please provide both reference and hypothesis texts.")
-             return
-
-         try:
-             measures = calculate_wer_metrics(
-                 hypothesis=hypothesis,
-                 reference=reference,
-                 normalize=normalize,
-                 words_to_filter=words_to_filter
-             )
-
-             # Display results
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 st.subheader("Main Metrics")
-                 metrics_df = pd.DataFrame({
-                     'Metric': ['WER', 'MER', 'WIL', 'WIP'],
-                     'Value': [
-                         f"{measures['wer']:.3f}",
-                         f"{measures['mer']:.3f}",
-                         f"{measures['wil']:.3f}",
-                         f"{measures['wip']:.3f}"
-                     ]
-                 })
-                 st.table(metrics_df)
-
-             with col2:
-                 st.subheader("Error Analysis")
-                 error_df = pd.DataFrame({
-                     'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
-                     'Count': [
-                         measures['substitutions'],
-                         measures['deletions'],
-                         measures['insertions'],
-                         measures['hits']
-                     ]
-                 })
-                 st.table(error_df)
-
-             # Add explanations
-             st.markdown("""
-             ### Metrics Explanation:
-             - **WER (Word Error Rate)**: The percentage of words that were incorrectly predicted
-             - **MER (Match Error Rate)**: The percentage of words that were incorrectly matched
-             - **WIL (Word Information Lost)**: The percentage of word information that was lost
-             - **WIP (Word Information Preserved)**: The percentage of word information that was preserved
-             """)
-
-         except Exception as e:
-             st.error(f"Error calculating WER: {str(e)}")
-
- if __name__ == "__main__":
-     main()
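The deleted Streamlit version folded filler-word filtering into the jiwer pipeline as a raw string-level lambda. For reference, here is the same idea as a standalone function; `filter_words` is a hypothetical helper sketched for illustration, not part of either version of the repo.

```python
# String-level filler-word filtering, equivalent to the lambda the deleted
# app.py composed into its jiwer pipeline (hypothetical standalone helper).
from typing import List

def filter_words(text: str, words_to_filter: List[str]) -> str:
    drop = {w.lower() for w in words_to_filter}
    return " ".join(word for word in text.split() if word.lower() not in drop)

print(filter_words("um the quick uh brown fox", ["um", "uh"]))
# -> "the quick brown fox"
```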
app_gradio.py ADDED
@@ -0,0 +1,239 @@
+ import gradio as gr
+ import jiwer
+ import pandas as pd
+ import logging
+ from typing import List, Optional, Tuple, Dict
+
+ # Set up logging configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     force=True,
+     handlers=[
+         logging.StreamHandler(),
+     ]
+ )
+ logger = logging.getLogger(__name__)
+
+ def calculate_wer_metrics(
+     hypothesis: str,
+     reference: str,
+     normalize: bool = True,
+     words_to_filter: Optional[List[str]] = None
+ ) -> Dict:
+     """
+     Calculate WER metrics between hypothesis and reference texts.
+
+     Args:
+         hypothesis (str): The hypothesis text
+         reference (str): The reference text
+         normalize (bool): Whether to normalize texts before comparison
+         words_to_filter (List[str], optional): Words to filter out before comparison
+
+     Returns:
+         dict: Dictionary containing WER metrics
+
+     Raises:
+         ValueError: If inputs are invalid or result in empty text after processing
+     """
+     logger.info(f"Calculating WER metrics with inputs - Hypothesis: {hypothesis}, Reference: {reference}")
+
+     # Validate inputs
+     if not hypothesis.strip() or not reference.strip():
+         raise ValueError("Both hypothesis and reference texts must contain non-empty strings")
+
+     if normalize:
+         # Define basic transformations
+         basic_transform = jiwer.Compose([
+             jiwer.ExpandCommonEnglishContractions(),
+             jiwer.ToLowerCase(),
+             jiwer.RemoveMultipleSpaces(),
+             jiwer.RemovePunctuation(),
+             jiwer.Strip(),
+             jiwer.ReduceToListOfListOfWords()
+         ])
+
+         if words_to_filter and any(words_to_filter):
+             # ReduceToListOfListOfWords yields a list of sentences, each a list
+             # of words, so the filter must operate sentence by sentence.
+             def filter_words_transform(sentences: List[List[str]]) -> List[List[str]]:
+                 drop = {w.lower() for w in words_to_filter}
+                 filtered = [[word for word in sentence if word.lower() not in drop]
+                             for sentence in sentences]
+                 if not any(filtered):
+                     raise ValueError("Text is empty after filtering words")
+                 return filtered
+
+             transformation = jiwer.Compose([
+                 basic_transform,
+                 filter_words_transform
+             ])
+         else:
+             transformation = basic_transform
+
+         # Pre-check the transformed text
+         try:
+             transformed_ref = transformation(reference)
+             transformed_hyp = transformation(hypothesis)
+             if not transformed_ref or not transformed_hyp:
+                 raise ValueError("Text is empty after normalization")
+             logger.debug(f"Transformed reference: {transformed_ref}")
+             logger.debug(f"Transformed hypothesis: {transformed_hyp}")
+         except Exception as e:
+             logger.error(f"Transformation error: {str(e)}")
+             raise ValueError(f"Error during text transformation: {str(e)}")
+
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis,
+             truth_transform=transformation,
+             hypothesis_transform=transformation
+         )
+     else:
+         measures = jiwer.compute_measures(
+             truth=reference,
+             hypothesis=hypothesis
+         )
+
+     return measures
+
+ def process_inputs(
+     reference: str,
+     hypothesis: str,
+     normalize: bool,
+     words_to_filter: str
+ ) -> Tuple[str, str, str, str]:
+     """
+     Process inputs and calculate WER metrics.
+
+     Args:
+         reference (str): Reference text
+         hypothesis (str): Hypothesis text
+         normalize (bool): Whether to normalize text
+         words_to_filter (str): Comma-separated words to filter
+
+     Returns:
+         Tuple[str, str, str, str]: HTML formatted main metrics, error analysis,
+             explanations, and error message
+     """
+     if not reference or not hypothesis:
+         return "Please provide both reference and hypothesis texts.", "", "", ""
+
+     try:
+         filter_words = [word.strip() for word in words_to_filter.split(",")] if words_to_filter else None
+         measures = calculate_wer_metrics(
+             hypothesis=hypothesis,
+             reference=reference,
+             normalize=normalize,
+             words_to_filter=filter_words
+         )
+
+         # Format main metrics
+         metrics_df = pd.DataFrame({
+             'Metric': ['WER', 'MER', 'WIL', 'WIP'],
+             'Value': [
+                 f"{measures['wer']:.3f}",
+                 f"{measures['mer']:.3f}",
+                 f"{measures['wil']:.3f}",
+                 f"{measures['wip']:.3f}"
+             ]
+         })
+
+         # Format error analysis
+         error_df = pd.DataFrame({
+             'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
+             'Count': [
+                 measures['substitutions'],
+                 measures['deletions'],
+                 measures['insertions'],
+                 measures['hits']
+             ]
+         })
+
+         metrics_html = metrics_df.to_html(index=False)
+         error_html = error_df.to_html(index=False)
+
+         explanation = """
+         <h3>Metrics Explanation:</h3>
+         <ul>
+             <li><b>WER (Word Error Rate)</b>: The percentage of words that were incorrectly predicted</li>
+             <li><b>MER (Match Error Rate)</b>: The percentage of words that were incorrectly matched</li>
+             <li><b>WIL (Word Information Lost)</b>: The percentage of word information that was lost</li>
+             <li><b>WIP (Word Information Preserved)</b>: The percentage of word information that was preserved</li>
+         </ul>
+         """
+
+         return metrics_html, error_html, explanation, ""
+
+     except Exception as e:
+         error_msg = f"Error calculating WER: {str(e)}"
+         logger.error(error_msg)
+         return "", "", "", error_msg
+
+ def load_example() -> Tuple[str, str]:
+     """Load example texts for demonstration."""
+     return (
+         "the quick brown fox jumps over the lazy dog",
+         "the quick brown fox jumped over lazy dog"
+     )
+
+ def create_interface() -> gr.Blocks:
+     """Create the Gradio interface."""
+     with gr.Blocks(title="WER Evaluation Tool") as interface:
+         gr.Markdown("# Word Error Rate (WER) Evaluation Tool")
+         gr.Markdown(
+             "This tool helps you evaluate the Word Error Rate (WER) between a reference "
+             "text and a hypothesis text. WER is commonly used in speech recognition and "
+             "machine translation evaluation."
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 reference = gr.Textbox(
+                     label="Reference Text",
+                     placeholder="Enter the reference text here...",
+                     lines=5
+                 )
+             with gr.Column():
+                 hypothesis = gr.Textbox(
+                     label="Hypothesis Text",
+                     placeholder="Enter the hypothesis text here...",
+                     lines=5
+                 )
+
+         with gr.Row():
+             normalize = gr.Checkbox(
+                 label="Normalize text (lowercase, remove punctuation)",
+                 value=True
+             )
+             words_to_filter = gr.Textbox(
+                 label="Words to filter (comma-separated)",
+                 placeholder="e.g., um, uh, ah"
+             )
+
+         with gr.Row():
+             example_btn = gr.Button("Load Example")
+             calculate_btn = gr.Button("Calculate WER", variant="primary")
+
+         with gr.Row():
+             metrics_output = gr.HTML(label="Main Metrics")
+             error_output = gr.HTML(label="Error Analysis")
+
+         explanation_output = gr.HTML()
+         error_msg_output = gr.HTML()
+
+         # Event handlers
+         example_btn.click(
+             load_example,
+             outputs=[reference, hypothesis]
+         )
+
+         calculate_btn.click(
+             process_inputs,
+             inputs=[reference, hypothesis, normalize, words_to_filter],
+             outputs=[metrics_output, error_output, explanation_output, error_msg_output]
+         )
+
+     return interface
+
+ if __name__ == "__main__":
+     logger.info("Application started")
+     app = create_interface()
+     app.launch()
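To see what the normalization chain in `calculate_wer_metrics` actually hands to the scorer, here is a small sketch, not part of the commit, applying the same `jiwer.Compose` pipeline to a raw string; the printed shape is indicative.

```python
# Sketch of the normalization chain used above: contractions expanded, case
# folded, punctuation stripped, then reduced to a list of word lists, which
# is the shape jiwer transforms pass along to the scoring step.
import jiwer

basic_transform = jiwer.Compose([
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemovePunctuation(),
    jiwer.Strip(),
    jiwer.ReduceToListOfListOfWords()
])

print(basic_transform("it's the   QUICK, brown fox!"))
# roughly: [['it', 'is', 'the', 'quick', 'brown', 'fox']]
```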
requirements.txt CHANGED
@@ -1,3 +1,3 @@
- streamlit==1.31.1
+ gradio==5.16.0
  jiwer==3.1.0
  pandas==2.2.0