uzagi committed on
Commit
cb3e494
·
1 Parent(s): ff685cc

add phoneme

Browse files
Files changed (6) hide show
  1. Dockerfile +1 -1
  2. __pycache__/app.cpython-312.pyc +0 -0
  3. app.py +7 -1
  4. model.py +13 -0
  5. phoneme.py +152 -0
  6. requirements.txt +6 -1
Dockerfile CHANGED
@@ -10,4 +10,4 @@ COPY --chown=user ./requirements.txt requirements.txt
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
  COPY --chown=user . /app
13
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
  COPY --chown=user . /app
13
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--reload"]
__pycache__/app.cpython-312.pyc ADDED
Binary file (393 Bytes). View file
 
app.py CHANGED
@@ -1,7 +1,13 @@
1
  from fastapi import FastAPI
2
 
 
 
3
  app = FastAPI()
4
 
5
  @app.get("/")
6
  def greet_json():
7
- return {"Hello": "World!"}
 
 
 
 
 
1
  from fastapi import FastAPI
2
 
3
+ from phoneme import test_sound
4
+
5
  app = FastAPI()
6
 
7
@app.get("/")
def greet_json():
    """Serve the root route with a fixed JSON greeting payload."""
    payload = {"Hello": "World!", "Eat": "Cat"}
    return payload
10
+
11
@app.post("/phoneme-scoring")
def scoring(input_text, audio):
    """Run the phoneme-scoring demo and return its report.

    Args:
        input_text: Accepted from the request but not used by this handler.
        audio: Accepted from the request but not used by this handler.

    Returns:
        Whatever ``test_sound()`` returns.
    """
    # NOTE(review): the request parameters are ignored; the handler only runs
    # the built-in sample. Wire them through once real scoring is implemented.
    # The result must be returned, otherwise the endpoint responds with null.
    return test_sound()
model.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
class ResponseData(BaseModel):
    """Model with a single string field, ``text``."""

    text: str
5
+
6
class PhonemeRequest(BaseModel):
    """Model pairing a transcript string with an audio string."""

    transcript: str
    # NOTE(review): how the audio is encoded in this string (path, URL,
    # base64, ...) is not visible here - confirm with the caller.
    audio: str
9
+
10
class PhonemeResponse(BaseModel):
    """Response envelope with a numeric code, a message and a data mapping."""

    code: int
    message: str
    # A field annotation must be a type; the literal ``{}`` is a dict instance
    # and pydantic cannot build a schema from it.
    data: dict
phoneme.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import torch
4
+ from transformers import AutoProcessor, AutoModelForCTC, Wav2Vec2PhonemeCTCTokenizer
5
+ import librosa
6
+ from itertools import groupby
7
+ from datasets import load_dataset
8
+ from phonemizer import phonemize
9
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
10
+
11
+ # PHONEMIZER_ESPEAK_LIBRARY="c:\Program Files\eSpeak NG\libespeak-ng.dll"
12
+ # PHONEMIZER_ESPEAK_PATH="c:\Program Files\eSpeak NG"
13
+ # ESPEAK_PATH = os.getenv("PHONEMIZER_ESPEAK_LIBRARY")
14
+ # if ESPEAK_PATH is not None:
15
+ # EspeakWrapper.set_library(ESPEAK_PATH)
16
+ # print(f"Loaded environment variables PHONEMIZER_ESPEAK_LIBRARY: {ESPEAK_PATH}")
17
+ # print(f"Using espeak library: {EspeakWrapper.library_path}")
18
+
19
+
20
# Checkpoint from which the model, processor and tokenizer are loaded.
# Alternative that was tried: "bookbot/wav2vec2-ljspeech-gruut".
checkpoint = "facebook/wav2vec2-lv-60-espeak-cv-ft"

processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForCTC.from_pretrained(checkpoint)
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained(checkpoint)

# Sampling rate configured on the processor's feature extractor.
sr = processor.feature_extractor.sampling_rate
27
+
28
+
29
def decode_phonemes(
    ids: torch.Tensor, processor: AutoProcessor, ignore_stress: bool = False
) -> str:
    """Greedy CTC-style decoding of a sequence of predicted token ids.

    Consecutive duplicate ids are collapsed first, then special tokens and the
    word delimiter are dropped; the remaining ids are decoded one at a time and
    joined with single spaces.

    Args:
        ids: Sequence of token ids, expected to be one-dimensional. A tensor
            (anything with ``tolist()``) or a plain iterable of ints.
        processor: Object exposing ``tokenizer.all_special_ids``,
            ``tokenizer.word_delimiter_token_id`` and ``decode(id)``.
        ignore_stress: When true, remove the IPA stress marks "ˈ" and "ˌ" from
            the result.

    Returns:
        The decoded phonemes separated by single spaces ("" if nothing is left).
    """
    # Convert to plain Python ints once: iterating a tensor yields 0-d tensors,
    # which turns every comparison below into a tensor operation.
    id_list = ids.tolist() if hasattr(ids, "tolist") else list(ids)

    # Collapse runs of identical ids.
    collapsed = [id_ for id_, _ in groupby(id_list)]

    # Ids that never contribute a phoneme to the output.
    skipped_ids = set(processor.tokenizer.all_special_ids)
    skipped_ids.add(processor.tokenizer.word_delimiter_token_id)

    phonemes = [processor.decode(id_) for id_ in collapsed if id_ not in skipped_ids]
    prediction = " ".join(phonemes)

    if ignore_stress:
        prediction = prediction.replace("ˈ", "").replace("ˌ", "")

    return prediction
50
+
51
+
52
def text_to_phonemes(text: str) -> str:
    """Convert text to phonemes with the module-level tokenizer's phonemizer.

    The tokenizer is asked to phonemize with ``phonemizer_lang="en-us"``.
    As a side effect the elapsed time of the call is printed.

    Args:
        text: The text to phonemize.

    Returns:
        The value returned by ``tokenizer.phonemize``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    s_time = time.perf_counter()
    phonemes = tokenizer.phonemize(text, phonemizer_lang="en-us")
    e_time = time.perf_counter()
    print(f"Execution time of text_to_phonemes: {e_time - s_time:.6f} seconds")
    return phonemes
60
+
61
+
62
def text_to_phonemes_2(text: str) -> str:
    """Convert text to phonemes by calling ``phonemizer.phonemize`` directly.

    Uses the espeak backend with language "en-us" and ``strip=True``.
    As a side effect the elapsed time of the call is printed.

    Args:
        text: The text to phonemize.

    Returns:
        The value returned by ``phonemize``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    s_time = time.perf_counter()
    phonemes = phonemize(text, language="en-us", backend="espeak", strip=True)
    e_time = time.perf_counter()
    print(f"Execution time of text_to_phonemes_2: {e_time - s_time:.6f} seconds")
    return phonemes
70
+
71
+
72
def separate_characters(input_string: str) -> str:
    """Return the characters of *input_string* separated by single spaces.

    All whitespace (spaces, tabs, newlines) is discarded first, so only
    visible characters become tokens.

    Args:
        input_string: Text whose characters should be spaced out.

    Returns:
        The non-whitespace characters joined by single spaces ("" for empty or
        whitespace-only input).
    """
    # NOTE(review): this splits per code point, so a symbol written with
    # several code points (e.g. a vowel followed by a length mark) becomes
    # several tokens - confirm this matches the model's vocabulary.
    without_whitespace = "".join(input_string.split())
    return " ".join(without_whitespace)
76
+
77
+
78
def predict_phonemes(audio_array):
    """Predict the phoneme string for a raw audio waveform.

    The waveform is run through the module-level processor and CTC model, the
    most likely token per frame is taken, and the first sequence in the batch
    is decoded with stress marks removed.

    Args:
        audio_array: Audio samples passed to the processor.
            NOTE(review): the audio is not resampled here, so it is assumed to
            already be at ``sr`` - confirm at the call site.

    Returns:
        Space-separated predicted phonemes without stress marks.
    """
    # Passing the sampling rate lets the feature extractor check it against
    # its own configuration instead of silently assuming it.
    inputs = processor(
        audio_array, sampling_rate=sr, return_tensors="pt", padding=True
    )

    # Forward everything the processor produced (including an attention mask
    # when it returns one); no gradients are needed for inference.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy choice: highest-scoring token id along the last axis.
    predicted_ids = torch.argmax(logits, dim=-1)

    return decode_phonemes(predicted_ids[0], processor, ignore_stress=True)
95
+
96
+
97
def adjust_phonemes(predicted: str) -> str:
    """Normalise the whitespace of a predicted phoneme string.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.

    Args:
        predicted: Space-separated phoneme string.

    Returns:
        The phonemes separated by exactly one space.
    """
    # split() without arguments drops empty fields, so runs of any length
    # collapse in one pass (a single replace of two spaces would not).
    return " ".join(predicted.split())
103
+
104
+
105
def calculate_score(expected: str, predicted: str) -> float:
    """Score a prediction as the fraction of expected phonemes it matches.

    Both strings are split on whitespace and compared position by position;
    the number of equal pairs is divided by the number of expected phonemes.

    Args:
        expected: Whitespace-separated reference phonemes.
        predicted: Whitespace-separated predicted phonemes.

    Returns:
        A value between 0.0 and 1.0; 0.0 when *expected* has no phonemes.
    """
    expected_list = expected.split()
    if not expected_list:
        # Nothing to match against; return a float to honour the annotation.
        return 0.0

    predicted_list = predicted.split()

    # NOTE(review): the comparison is purely positional, so one inserted or
    # dropped phoneme shifts every later pair - consider an alignment-based
    # metric if that matters.
    correct_matches = sum(e == p for e, p in zip(expected_list, predicted_list))

    return correct_matches / len(expected_list)
115
+
116
+
117
def test_sound():
    """Score the first sample of a dummy LibriSpeech split end to end.

    Loads the dataset, phonemizes the sample's transcript, predicts phonemes
    from the sample's audio, compares the two and prints the intermediate
    results together with the total elapsed time.

    Returns:
        A dict with one key, "text", holding a multi-line report with the
        transcript, expected, predicted and adjusted phonemes and the score.
    """
    # perf_counter is monotonic, which makes it the right clock for durations.
    start_time = time.perf_counter()

    # NOTE(review): trust_remote_code=True lets the dataset repository execute
    # its own loading script on this machine - only keep it for sources you
    # trust. The dataset is also reloaded on every call.
    ds = load_dataset(
        "patrickvonplaten/librispeech_asr_dummy",
        "clean",
        split="validation",
        trust_remote_code=True,
    )

    # Fetch the sample once instead of indexing the dataset twice.
    sample = ds[0]
    audio_array = sample["audio"]["array"]
    expected_transcript = sample["text"]

    # Reference phonemes from the transcript, spaced out character by
    # character before comparison.
    expected_phonemes = text_to_phonemes(expected_transcript)
    expected_phonemes = separate_characters(expected_phonemes)

    # Phonemes recognised from the audio.
    predicted_phonemes = predict_phonemes(audio_array)
    adjusted_phonemes = adjust_phonemes(predicted_phonemes)

    print(f"Expected Phonemes: {expected_phonemes}")
    print(f"Predicted Phonemes: {predicted_phonemes}")
    print(f"Adjusted Phonemes: {adjusted_phonemes}")

    score = calculate_score(expected_phonemes, adjusted_phonemes)

    report = f"Transcript: {expected_transcript}\nExpected Phonemes: {expected_phonemes}\nPredicted Phonemes: {predicted_phonemes}\nAdjusted Phonemes: {adjusted_phonemes}\nScore: {score:.2f}"

    execution_time = time.perf_counter() - start_time
    print(f"Execution time: {execution_time:.6f} seconds")
    return {"text": report}
requirements.txt CHANGED
@@ -1,2 +1,7 @@
1
  fastapi
2
- uvicorn[standard]
 
 
 
 
 
 
1
  fastapi
2
+ uvicorn[standard]
3
+ torch
4
+ transformers
5
+ librosa
6
+ phonemizer
7
+ datasets