Spaces:
Paused
Paused
sivakorn-su
committed on
Commit
·
8d5bb0a
1
Parent(s):
67ca65d
fix docker file and add nlp
Browse files- Dockerfile +4 -3
- requirements.txt +1 -1
- utils.py +10 -8
Dockerfile
CHANGED
|
@@ -27,8 +27,8 @@ RUN ln -fs /usr/share/zoneinfo/Asia/Bangkok /etc/localtime && \
|
|
| 27 |
dpkg-reconfigure -f noninteractive tzdata
|
| 28 |
|
| 29 |
# สร้าง directory cache ต่าง ๆ
|
| 30 |
-
RUN mkdir -p /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads \
|
| 31 |
-
&& chmod -R 777 /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads
|
| 32 |
|
| 33 |
# เพิ่ม PATH สำหรับ cuDNN 9 ให้เจอ .so
|
| 34 |
ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache \
|
|
@@ -42,7 +42,8 @@ ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache \
|
|
| 42 |
TMPDIR=/tmp \
|
| 43 |
TEMP=/tmp \
|
| 44 |
TMP=/tmp \
|
| 45 |
-
LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
|
|
|
|
| 46 |
|
| 47 |
# ติดตั้ง Python dependencies
|
| 48 |
COPY requirements.txt .
|
|
|
|
| 27 |
dpkg-reconfigure -f noninteractive tzdata
|
| 28 |
|
| 29 |
# สร้าง directory cache ต่าง ๆ
|
| 30 |
+
RUN mkdir -p /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads /tmp/pythainlp_data \
|
| 31 |
+
&& chmod -R 777 /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads /tmp/pythainlp_data
|
| 32 |
|
| 33 |
# เพิ่ม PATH สำหรับ cuDNN 9 ให้เจอ .so
|
| 34 |
ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache \
|
|
|
|
| 42 |
TMPDIR=/tmp \
|
| 43 |
TEMP=/tmp \
|
| 44 |
TMP=/tmp \
|
| 45 |
+
LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
|
| 46 |
+
PYTHAINLP_DATA_DIR=/tmp/pythainlp_data
|
| 47 |
|
| 48 |
# ติดตั้ง Python dependencies
|
| 49 |
COPY requirements.txt .
|
requirements.txt
CHANGED
|
@@ -28,7 +28,7 @@ pandas==2.1.4
|
|
| 28 |
numpy==1.24.4
|
| 29 |
omegaconf==2.3.0
|
| 30 |
pyyaml==6.0.1
|
| 31 |
-
|
| 32 |
# Utilities
|
| 33 |
nest_asyncio==1.5.8
|
| 34 |
python-dotenv==1.0.0
|
|
|
|
| 28 |
numpy==1.24.4
|
| 29 |
omegaconf==2.3.0
|
| 30 |
pyyaml==6.0.1
|
| 31 |
+
pythainlp==5.1.2
|
| 32 |
# Utilities
|
| 33 |
nest_asyncio==1.5.8
|
| 34 |
python-dotenv==1.0.0
|
utils.py
CHANGED
|
@@ -12,6 +12,7 @@ from collections import Counter
|
|
| 12 |
import time
|
| 13 |
from config import UPLOAD_FOLDER
|
| 14 |
from models import pipelines, models, together
|
|
|
|
| 15 |
|
| 16 |
def save_uploaded_file(file: UploadFile) -> str:
|
| 17 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
|
@@ -20,6 +21,11 @@ def save_uploaded_file(file: UploadFile) -> str:
|
|
| 20 |
shutil.copyfileobj(file.file, f)
|
| 21 |
return filepath
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def extract_and_normalize_audio(file_path: str) -> str:
|
| 24 |
ext = os.path.splitext(file_path)[1].lower()
|
| 25 |
audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
|
|
@@ -81,25 +87,22 @@ def transcribe_segments(segment_folder: str) -> pd.DataFrame:
|
|
| 81 |
|
| 82 |
if words:
|
| 83 |
full_text = ''.join([w.word for w in words])
|
|
|
|
| 84 |
probs = [w.probability for w in words if w.probability is not None]
|
| 85 |
avg_prob = round(np.mean(probs), 4) if probs else 0.0
|
| 86 |
-
start_time = round(min(w.start for w in words if w.start is not None), 2)
|
| 87 |
-
end_time = round(max(w.end for w in words if w.end is not None), 2)
|
| 88 |
|
| 89 |
results.append({
|
| 90 |
"filename": filename,
|
| 91 |
"text": full_text,
|
|
|
|
| 92 |
"avg_probability": avg_prob,
|
| 93 |
-
"start": start_time,
|
| 94 |
-
"end": end_time
|
| 95 |
})
|
| 96 |
else:
|
| 97 |
results.append({
|
| 98 |
"filename": filename,
|
| 99 |
"text": "",
|
|
|
|
| 100 |
"avg_probability": 0.0,
|
| 101 |
-
"start": 0.0,
|
| 102 |
-
"end": 0.0
|
| 103 |
})
|
| 104 |
|
| 105 |
except Exception as e:
|
|
@@ -107,9 +110,8 @@ def transcribe_segments(segment_folder: str) -> pd.DataFrame:
|
|
| 107 |
results.append({
|
| 108 |
"filename": filename,
|
| 109 |
"text": "",
|
|
|
|
| 110 |
"avg_probability": 0.0,
|
| 111 |
-
"start": 0.0,
|
| 112 |
-
"end": 0.0,
|
| 113 |
"error": str(e)
|
| 114 |
})
|
| 115 |
|
|
|
|
| 12 |
import time
|
| 13 |
from config import UPLOAD_FOLDER
|
| 14 |
from models import pipelines, models, together
|
| 15 |
+
from pythainlp.spell import correct_sent
|
| 16 |
|
| 17 |
def save_uploaded_file(file: UploadFile) -> str:
|
| 18 |
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
|
|
|
| 21 |
shutil.copyfileobj(file.file, f)
|
| 22 |
return filepath
|
| 23 |
|
| 24 |
def correct_text_with_tokenizer(text: str) -> str:
    """Spell-correct Thai text one token at a time.

    Tokenizes *text* with PyThaiNLP's "newmm" engine, runs each token
    through the spell corrector, and re-joins the corrected tokens
    (Thai is written without spaces, hence ''.join).

    Args:
        text: Raw text to correct (presumably Thai — the "newmm"
            engine is a Thai word tokenizer).

    Returns:
        The corrected text as a single string.
    """
    # BUG FIX: the module only imports `correct_sent` from pythainlp.spell,
    # but this function uses `word_tokenize` and `correct`, neither of
    # which is imported anywhere — calling it raised NameError. Import
    # them locally so the function is self-contained.
    from pythainlp.tokenize import word_tokenize
    from pythainlp.spell import correct

    tokens = word_tokenize(text, engine="newmm")
    corrected_tokens = [correct(word) for word in tokens]
    return ''.join(corrected_tokens)
|
| 28 |
+
|
| 29 |
def extract_and_normalize_audio(file_path: str) -> str:
|
| 30 |
ext = os.path.splitext(file_path)[1].lower()
|
| 31 |
audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
|
|
|
|
| 87 |
|
| 88 |
if words:
|
| 89 |
full_text = ''.join([w.word for w in words])
|
| 90 |
+
nlp_corrected_text = correct_sent(full_text)
|
| 91 |
probs = [w.probability for w in words if w.probability is not None]
|
| 92 |
avg_prob = round(np.mean(probs), 4) if probs else 0.0
|
|
|
|
|
|
|
| 93 |
|
| 94 |
results.append({
|
| 95 |
"filename": filename,
|
| 96 |
"text": full_text,
|
| 97 |
+
"nlp_corrected_text":nlp_corrected_text,
|
| 98 |
"avg_probability": avg_prob,
|
|
|
|
|
|
|
| 99 |
})
|
| 100 |
else:
|
| 101 |
results.append({
|
| 102 |
"filename": filename,
|
| 103 |
"text": "",
|
| 104 |
+
"nlp_corrected_text":"",
|
| 105 |
"avg_probability": 0.0,
|
|
|
|
|
|
|
| 106 |
})
|
| 107 |
|
| 108 |
except Exception as e:
|
|
|
|
| 110 |
results.append({
|
| 111 |
"filename": filename,
|
| 112 |
"text": "",
|
| 113 |
+
"nlp_corrected_text":"",
|
| 114 |
"avg_probability": 0.0,
|
|
|
|
|
|
|
| 115 |
"error": str(e)
|
| 116 |
})
|
| 117 |
|