Jayem-11 committed on
Commit 1f1d328 · 1 Parent(s): b35add0

Upload 3 files

Files changed (3):
  1. Dockerfile +20 -0
  2. main.py +64 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,20 @@
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/app
+
+ COPY --chown=user . $HOME/app
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,64 @@
+ from fastapi import FastAPI, UploadFile
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
+ from transformers import WhisperFeatureExtractor, WhisperTokenizer
+ import librosa
+ import numpy as np
+ import torch
+
+
+ app = FastAPI()
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Base Whisper feature extractor/tokenizer plus the fine-tuned Swahili checkpoint
+ feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
+ tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Swahili", task="transcribe")
+ processor = WhisperProcessor.from_pretrained("Jayem-11/whisper-small-swahili-3")
+ asr_model = WhisperForConditionalGeneration.from_pretrained("Jayem-11/whisper-small-swahili-3").to(device)
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language="sw", task="transcribe")
+
+
+ @app.get("/")
+ async def read_root():
+     return {"message": "Successful"}
+
+
+ def extract_and_resample_audio(file):
+     # Persist the uploaded bytes so librosa can read them from disk
+     with open("audio.wav", "wb") as f:
+         f.write(file)
+
+     # Load the temporary audio file
+     audio_data, sr = librosa.load("audio.wav")
+
+     # Resample the audio to the 16000 Hz rate Whisper expects
+     audio_resampled = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
+
+     print("Done resampling")
+
+     return audio_resampled
+
+
+ @app.post("/predict")
+ async def predict(file: UploadFile):
+     audio_resampled = extract_and_resample_audio(await file.read())
+
+     # Log-Mel input features with a leading batch dimension
+     input_feats = feature_extractor(audio_resampled, sampling_rate=16000).input_features[0]
+     input_feats = np.expand_dims(input_feats, axis=0)
+     input_feats = torch.from_numpy(input_feats)
+
+     # Generate token ids, forcing Swahili transcription via the decoder prompt ids
+     output = asr_model.generate(
+         input_features=input_feats.to(device),
+         forced_decoder_ids=forced_decoder_ids,
+         max_new_tokens=255,
+     ).cpu().numpy()
+
+     sample_text = tokenizer.batch_decode(output, skip_special_tokens=True)
+
+     return {"Text": sample_text}
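A minimal client sketch for exercising the /predict endpoint once the container is running; the local URL and port (taken from the Dockerfile's CMD), the sample.wav filename, and the use of the requests package are illustrative assumptions, not part of the commit:

import requests  # assumed installed on the client side; not a server requirement

# The multipart field must be named "file" to match the UploadFile parameter.
with open("sample.wav", "rb") as f:  # hypothetical local audio file
    response = requests.post(
        "http://localhost:7860/predict",
        files={"file": ("sample.wav", f, "audio/wav")},
    )

print(response.json())  # e.g. {"Text": ["..."]}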
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ fastapi
+ uvicorn
+ transformers
+ moviepy
+ librosa
+ numpy
+ torch
+ python-multipart
+ sentencepiece
+ protobuf