sivakorn-su committed on
Commit
8d5bb0a
·
1 Parent(s): 67ca65d

fix docker file and add nlp

Browse files
Files changed (3) hide show
  1. Dockerfile +4 -3
  2. requirements.txt +1 -1
  3. utils.py +10 -8
Dockerfile CHANGED
@@ -27,8 +27,8 @@ RUN ln -fs /usr/share/zoneinfo/Asia/Bangkok /etc/localtime && \
27
  dpkg-reconfigure -f noninteractive tzdata
28
 
29
  # สร้าง directory cache ต่าง ๆ
30
- RUN mkdir -p /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads \
31
- && chmod -R 777 /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads
32
 
33
  # เพิ่ม PATH สำหรับ cuDNN 9 ให้เจอ .so
34
  ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache \
@@ -42,7 +42,8 @@ ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache \
42
  TMPDIR=/tmp \
43
  TEMP=/tmp \
44
  TMP=/tmp \
45
- LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 
46
 
47
  # ติดตั้ง Python dependencies
48
  COPY requirements.txt .
 
27
  dpkg-reconfigure -f noninteractive tzdata
28
 
29
  # สร้าง directory cache ต่าง ๆ
30
+ RUN mkdir -p /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads /tmp/pythainlp_data \
31
+ && chmod -R 777 /tmp/hf_cache /tmp/torch_cache /tmp/matplotlib /tmp/xdg_cache /tmp/home /tmp/uploads /tmp/pythainlp_data
32
 
33
  # เพิ่ม PATH สำหรับ cuDNN 9 ให้เจอ .so
34
  ENV HUGGINGFACE_HUB_CACHE=/tmp/hf_cache \
 
42
  TMPDIR=/tmp \
43
  TEMP=/tmp \
44
  TMP=/tmp \
45
+ LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
46
+ PYTHAINLP_DATA_DIR=/tmp/pythainlp_data
47
 
48
  # ติดตั้ง Python dependencies
49
  COPY requirements.txt .
requirements.txt CHANGED
@@ -28,7 +28,7 @@ pandas==2.1.4
28
  numpy==1.24.4
29
  omegaconf==2.3.0
30
  pyyaml==6.0.1
31
-
32
  # Utilities
33
  nest_asyncio==1.5.8
34
  python-dotenv==1.0.0
 
28
  numpy==1.24.4
29
  omegaconf==2.3.0
30
  pyyaml==6.0.1
31
+ pythainlp==5.1.2
32
  # Utilities
33
  nest_asyncio==1.5.8
34
  python-dotenv==1.0.0
utils.py CHANGED
@@ -12,6 +12,7 @@ from collections import Counter
12
  import time
13
  from config import UPLOAD_FOLDER
14
  from models import pipelines, models, together
 
15
 
16
  def save_uploaded_file(file: UploadFile) -> str:
17
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@@ -20,6 +21,11 @@ def save_uploaded_file(file: UploadFile) -> str:
20
  shutil.copyfileobj(file.file, f)
21
  return filepath
22
 
 
 
 
 
 
23
  def extract_and_normalize_audio(file_path: str) -> str:
24
  ext = os.path.splitext(file_path)[1].lower()
25
  audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
@@ -81,25 +87,22 @@ def transcribe_segments(segment_folder: str) -> pd.DataFrame:
81
 
82
  if words:
83
  full_text = ''.join([w.word for w in words])
 
84
  probs = [w.probability for w in words if w.probability is not None]
85
  avg_prob = round(np.mean(probs), 4) if probs else 0.0
86
- start_time = round(min(w.start for w in words if w.start is not None), 2)
87
- end_time = round(max(w.end for w in words if w.end is not None), 2)
88
 
89
  results.append({
90
  "filename": filename,
91
  "text": full_text,
 
92
  "avg_probability": avg_prob,
93
- "start": start_time,
94
- "end": end_time
95
  })
96
  else:
97
  results.append({
98
  "filename": filename,
99
  "text": "",
 
100
  "avg_probability": 0.0,
101
- "start": 0.0,
102
- "end": 0.0
103
  })
104
 
105
  except Exception as e:
@@ -107,9 +110,8 @@ def transcribe_segments(segment_folder: str) -> pd.DataFrame:
107
  results.append({
108
  "filename": filename,
109
  "text": "",
 
110
  "avg_probability": 0.0,
111
- "start": 0.0,
112
- "end": 0.0,
113
  "error": str(e)
114
  })
115
 
 
12
  import time
13
  from config import UPLOAD_FOLDER
14
  from models import pipelines, models, together
15
+ from pythainlp.spell import correct_sent
16
 
17
  def save_uploaded_file(file: UploadFile) -> str:
18
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 
21
  shutil.copyfileobj(file.file, f)
22
  return filepath
23
 
24
def correct_text_with_tokenizer(text: str) -> str:
    """Spell-correct Thai text token by token.

    Segments the input with PyThaiNLP's dictionary-based "newmm" word
    tokenizer, runs each token through the default spell corrector, and
    joins the corrected tokens back with no separator (Thai writing does
    not put spaces between words).

    Args:
        text: Raw, possibly misspelled, Thai text.

    Returns:
        The text with each token replaced by its best spelling correction;
        the empty string is returned unchanged.
    """
    # Import locally: the visible module-level imports bring in only
    # `correct_sent` from pythainlp.spell, so `word_tokenize` and
    # `correct` would otherwise raise NameError at call time.
    # NOTE(review): if the unseen top of utils.py already imports these,
    # the local imports are harmless no-ops.
    from pythainlp.tokenize import word_tokenize
    from pythainlp.spell import correct

    if not text:
        return text

    tokens = word_tokenize(text, engine="newmm")
    corrected_tokens = [correct(token) for token in tokens]
    return ''.join(corrected_tokens)
28
+
29
  def extract_and_normalize_audio(file_path: str) -> str:
30
  ext = os.path.splitext(file_path)[1].lower()
31
  audio_path = os.path.join(UPLOAD_FOLDER, "extracted_audio.wav")
 
87
 
88
  if words:
89
  full_text = ''.join([w.word for w in words])
90
+ nlp_corrected_text = correct_sent(full_text)
91
  probs = [w.probability for w in words if w.probability is not None]
92
  avg_prob = round(np.mean(probs), 4) if probs else 0.0
 
 
93
 
94
  results.append({
95
  "filename": filename,
96
  "text": full_text,
97
+ "nlp_corrected_text":nlp_corrected_text,
98
  "avg_probability": avg_prob,
 
 
99
  })
100
  else:
101
  results.append({
102
  "filename": filename,
103
  "text": "",
104
+ "nlp_corrected_text":"",
105
  "avg_probability": 0.0,
 
 
106
  })
107
 
108
  except Exception as e:
 
110
  results.append({
111
  "filename": filename,
112
  "text": "",
113
+ "nlp_corrected_text":"",
114
  "avg_probability": 0.0,
 
 
115
  "error": str(e)
116
  })
117