Spaces:

abdullahalmunem
/

Bangla_Sentence_Punctuation_Restoration

Sleeping

App Files Files Community

abdullahalmunem committed on Jul 27

Commit

f81cfe2

1 Parent(s): ad3fa03

model added

Browse files

Files changed (12) hide show

Dockerfile +18 -0
api_onnx.py +85 -0
app.py +406 -0
config.py +75 -0
distilbert-base-multilingual-cased/config.json +22 -0
distilbert-base-multilingual-cased/tokenizer.json +0 -0
distilbert-base-multilingual-cased/tokenizer_config.json +1 -0
distilbert-base-multilingual-cased/vocab.txt +0 -0
entrypoint.sh +10 -0
inference_onnx.py +172 -0
poc_onnx_model_punctuation_batch.onnx +3 -0
requirements.txt +9 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,18 @@

+FROM pytorch/pytorch:latest
+WORKDIR /app
+# Install curl, then remove the apt package lists in the same layer
+RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
+# Install the Python dependencies before copying the rest of the source
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy your source code
+COPY . .
+# Expose port 7860 (Hugging Face Spaces default)
+EXPOSE 7860
+# Run both API and Gradio via entrypoint.sh
+CMD ["bash", "entrypoint.sh"]

api_onnx.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import os
+import re
+from fastapi import FastAPI, Request
+from pydantic import BaseModel
+from inference_onnx import get_transcription
+import torch
+import onnxruntime as ort
+from config import *
+from contextlib import asynccontextmanager
+# Global session object (attached to app.state)
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load the tokenizer and the ONNX session once and keep them on ``app.state``.
+
+    The code before ``yield`` is the startup phase; the code after it is the
+    shutdown phase. The loaded objects are stored as ``app.state.device``,
+    ``app.state.tokenizer``, ``app.state.token_style`` and ``app.state.session``.
+    """
+    model_dir = "./distilbert-base-multilingual-cased"
+    onnx_model_path = "./poc_onnx_model_punctuation_batch.onnx"
+    # MODELS entries are (model class, tokenizer class, output dimension, token style)
+    _, tokenizer_cls, _, token_style = MODELS[model_dir]
+    print("🔧 Loading model...")
+    app.state.device = torch.device('cpu')
+    app.state.tokenizer = tokenizer_cls.from_pretrained(model_dir)
+    app.state.token_style = token_style
+    # CPU-only inference; put 'CUDAExecutionProvider' first in this list to use a GPU.
+    providers = ['CPUExecutionProvider']
+    app.state.session = ort.InferenceSession(onnx_model_path, providers=providers)
+    print("✅ ONNX model loaded into memory.")
+    yield
+    print("🧹 Shutting down...")
+app = FastAPI(lifespan=lifespan)
+# Supported punctuation mark -> label name
+punc_dict = {
+    '!': 'EXCLAMATION',
+    '?': 'QUESTION',
+    ',': 'COMMA',
+    ';': 'SEMICOLON',
+    ':': 'COLON',
+    '-': 'HYPHEN',
+    '।': 'DARI',
+}
+# The supported marks as a set, used for membership tests and to build regex character classes
+allowed_punctuations = set(punc_dict.keys())
+def clean_and_normalize_text(text, remove_punctuations=False):
+    """Normalize Bangla text, either deleting or keeping the supported punctuation.
+
+    With ``remove_punctuations=True`` the supported marks are deleted and runs
+    of whitespace collapse to a single space. Otherwise every character that
+    is not a supported mark, whitespace, or inside U+0980-U+09FF is removed;
+    each surviving text chunk is re-joined with one leading space.
+    """
+    punct_class = f"[{re.escape(''.join(allowed_punctuations))}]"
+    if remove_punctuations:
+        without_marks = re.sub(punct_class, "", text)
+        return re.sub(r'\s+', ' ', without_marks).strip()
+    pieces = []
+    # The capturing group makes re.split hand back the marks as separate items
+    for part in re.split(f"({punct_class})", text):
+        if part in allowed_punctuations:
+            pieces.append(part)
+            continue
+        kept = re.sub(rf"[^\u0980-\u09FF\u09E6-\u09EF\s]", "", part)
+        kept = re.sub(r'\s+', ' ', kept).strip()
+        if kept:
+            # Leading space keeps words apart from a preceding mark or chunk
+            pieces.append(' ' + kept)
+    return re.sub(r'\s+', ' ', ''.join(pieces)).strip()
+class TextInput(BaseModel):
+    """Request body of ``/punctuate``: the text to punctuate."""
+    text: str
+@app.post("/punctuate")
+async def punctuate_text(data: TextInput):
+    """Normalize the submitted text, strip its punctuation and return the restored version."""
+    state = app.state
+    # First pass filters characters; second pass removes the marks the first pass kept
+    filtered = clean_and_normalize_text(data.text)
+    unpunctuated = clean_and_normalize_text(filtered, remove_punctuations=True)
+    restored_text = get_transcription(
+        unpunctuated, state.session, state.tokenizer, state.device, state.token_style
+    )
+    return {"restored_text": restored_text}
+if __name__ == "__main__":
+    # Serve this module's ``app`` with one worker on port 5685 when run as a script
+    import uvicorn
+    uvicorn.run("api_onnx:app", host="0.0.0.0", port=5685, workers=1)

app.py ADDED Viewed

	@@ -0,0 +1,406 @@

+import os
+import gradio as gr
+import requests
+import re
+import time
+import pandas as pd
+from typing import Dict, Tuple, List, Optional
+# Configuration
+# Endpoint of the punctuation restoration API that restore_punctuation() posts to
+API_URL = "http://localhost:5685/punctuate"
+# Supported punctuation mark -> label name used in the evaluation table
+punc_dict = {
+    '!': 'EXCLAMATION',
+    '?': 'QUESTION',
+    ',': 'COMMA',
+    ';': 'SEMICOLON',
+    ':': 'COLON',
+    '-': 'HYPHEN',
+    '।': 'DARI',
+}
+# The supported marks as a set, used for membership tests and to build regex character classes
+allowed_punctuations = set(punc_dict.keys())
+def clean_and_normalize_text(text, remove_punctuations=False):
+    """Normalize Bangla text, either deleting or keeping the supported punctuation.
+
+    With ``remove_punctuations=True`` the supported marks are deleted and runs
+    of whitespace collapse to a single space. Otherwise every character that
+    is not a supported mark, whitespace, or inside U+0980-U+09FF is removed;
+    each surviving text chunk is re-joined with one leading space.
+    """
+    punct_class = f"[{re.escape(''.join(allowed_punctuations))}]"
+    if remove_punctuations:
+        stripped_of_marks = re.sub(punct_class, "", text)
+        return re.sub(r'\s+', ' ', stripped_of_marks).strip()
+    kept_parts = []
+    # The capturing group makes re.split hand back the marks as separate items
+    for segment in re.split(f"({punct_class})", text):
+        if segment in allowed_punctuations:
+            kept_parts.append(segment)
+            continue
+        cleaned = re.sub(rf"[^\u0980-\u09FF\u09E6-\u09EF\s]", "", segment)
+        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+        if cleaned:
+            # Leading space keeps words apart from a preceding mark or chunk
+            kept_parts.append(' ' + cleaned)
+    return re.sub(r'\s+', ' ', ''.join(kept_parts)).strip()
+def restore_punctuation(text, timeout=60):
+    """Call the punctuation restoration API.
+
+    Args:
+        text: Text sent as the ``text`` field of the JSON payload.
+        timeout: Seconds to wait for the API before giving up.
+
+    Returns:
+        ``(result, seconds)``. On HTTP 200 ``result`` is the ``restored_text``
+        field of the response; otherwise it is a message starting with
+        ``"API Error"`` or ``"Connection Error"``. ``seconds`` is the request
+        duration, or 0.0 when the request itself failed.
+    """
+    start_time = time.time()
+    try:
+        # Without a timeout, requests waits indefinitely on an unresponsive server
+        response = requests.post(API_URL, json={"text": text}, timeout=timeout)
+        api_time = time.time() - start_time
+        if response.status_code != 200:
+            return f"API Error: {response.status_code} - {response.text}", api_time
+        return response.json().get("restored_text"), api_time
+    except Exception as e:
+        # Boundary with the UI: report the failure as text instead of raising
+        return f"Connection Error: {str(e)}", 0.0
+def dummy_restore_punctuation(text):
+    """Stand-in for the real API: pause briefly, then add a few fixed punctuation marks."""
+    simulated_delay = 0.5
+    time.sleep(simulated_delay)
+    tokens = text.split()
+    count = len(tokens)
+    if count > 5:
+        # Longer inputs: comma after the third word, question mark at the end
+        tokens[2] += ','
+        tokens[-1] += '?'
+    elif count > 2:
+        tokens[-1] += '!'
+    return ' '.join(tokens), simulated_delay
+def tokenize_with_punctuation(text):
+    """Split text into words and supported punctuation marks, in order of appearance."""
+    splitter = f"([{re.escape(''.join(allowed_punctuations))}])"
+    tokens = []
+    for piece in re.split(splitter, text):
+        if piece in allowed_punctuations:
+            tokens.append(piece)
+        else:
+            # str.split() discards all whitespace, so blank pieces add nothing
+            tokens.extend(piece.split())
+    return tokens
+def _punctuation_by_word(tokens):
+    """Group punctuation marks by the index of the word they follow (-1 = before any word)."""
+    punct_map = {}
+    word_idx = -1
+    for token in tokens:
+        if token in allowed_punctuations:
+            punct_map.setdefault(word_idx, []).append(token)
+        else:
+            word_idx += 1
+    return punct_map
+def compare_texts(ground_truth, predicted):
+    """Compare ground truth and predicted text word by word and mark by mark.
+
+    Words are aligned by position. The marks following the i-th word of the
+    ground truth are compared, in order, with the marks following the i-th
+    word of the prediction.
+
+    Returns:
+        ``(comparison_result, correct_puncs, wrong_puncs, gt_punc_counts)``.
+        ``comparison_result`` is a list of ``(text, status, color)`` tuples;
+        the three dicts map punctuation names to counts. Extra predicted marks
+        appear in ``comparison_result`` but are not counted in ``wrong_puncs``.
+    """
+    gt_tokens = tokenize_with_punctuation(ground_truth)
+    pred_tokens = tokenize_with_punctuation(predicted)
+    comparison_result = []
+    correct_puncs = {}
+    wrong_puncs = {}
+    gt_punc_counts = {}
+    # Count punctuations in ground truth
+    for token in gt_tokens:
+        if token in allowed_punctuations:
+            punc_name = punc_dict[token]
+            gt_punc_counts[punc_name] = gt_punc_counts.get(punc_name, 0) + 1
+    gt_words = [token for token in gt_tokens if token not in allowed_punctuations]
+    pred_words = [token for token in pred_tokens if token not in allowed_punctuations]
+    gt_punct_map = _punctuation_by_word(gt_tokens)
+    pred_punct_map = _punctuation_by_word(pred_tokens)
+    max_words = max(len(gt_words), len(pred_words))
+    # Start at -1 so marks that precede the first word are compared as well;
+    # previously they were counted in gt_punc_counts but never scored.
+    for i in range(-1, max_words):
+        if i >= 0:
+            has_gt = i < len(gt_words)
+            has_pred = i < len(pred_words)
+            if has_gt and has_pred:
+                if gt_words[i] == pred_words[i]:
+                    comparison_result.append((gt_words[i], "correct", "black"))
+                else:
+                    comparison_result.append((f"{gt_words[i]}→{pred_words[i]}", "word_diff", "orange"))
+            elif has_gt:
+                comparison_result.append((f"{gt_words[i]}→''", "missing_word", "red"))
+            else:
+                comparison_result.append((f"''→{pred_words[i]}", "extra_word", "red"))
+        # Compare punctuations after this word
+        gt_puncs = gt_punct_map.get(i, [])
+        pred_puncs = pred_punct_map.get(i, [])
+        for j in range(max(len(gt_puncs), len(pred_puncs))):
+            if j >= len(gt_puncs):
+                # Extra punctuation (not counted in wrong_puncs since it's not in GT)
+                comparison_result.append((f"''→{pred_puncs[j]}", "extra_punct", "red"))
+                continue
+            gt_punc = gt_puncs[j]
+            punc_name = punc_dict[gt_punc]
+            if j >= len(pred_puncs):
+                # Missing punctuation
+                wrong_puncs[punc_name] = wrong_puncs.get(punc_name, 0) + 1
+                comparison_result.append((f"{gt_punc}→''", "missing_punct", "red"))
+            elif gt_punc == pred_puncs[j]:
+                correct_puncs[punc_name] = correct_puncs.get(punc_name, 0) + 1
+                comparison_result.append((gt_punc, "correct", "green"))
+            else:
+                # Wrong punctuation
+                wrong_puncs[punc_name] = wrong_puncs.get(punc_name, 0) + 1
+                comparison_result.append((f"{gt_punc}→{pred_puncs[j]}", "wrong_punct", "red"))
+    return comparison_result, correct_puncs, wrong_puncs, gt_punc_counts
+def create_evaluation_table(correct_puncs, wrong_puncs, gt_punc_counts):
+    """Build the per-punctuation summary table, one row per mark present in the ground truth."""
+    columns = [
+        "Punctuation Name",
+        "Correctly Classified",
+        "Wrongly Classified",
+        "Count in Ground Truth",
+    ]
+    # Marks missing from the correct/wrong dicts are reported as 0
+    rows = [
+        [name, correct_puncs.get(name, 0), wrong_puncs.get(name, 0), total]
+        for name, total in gt_punc_counts.items()
+    ]
+    return pd.DataFrame(rows, columns=columns)
+def format_comparison_html(comparison_result):
+    """Render comparison tuples as an HTML fragment followed by a legend.
+
+    Args:
+        comparison_result: Iterable of ``(token, status, color)`` tuples.
+
+    Returns:
+        An HTML string. Token text is HTML-escaped before insertion because it
+        can contain raw user input (for example when the prediction is an
+        error message echoing the input).
+    """
+    from html import escape
+    red_style = "background-color: #f8d7da; color: #721c24; padding: 2px 4px; margin: 1px; border-radius: 3px; font-weight: bold;"
+    parts = ["<div style='font-family: monospace; font-size: 16px; line-height: 1.8; padding: 20px; border: 1px solid #ddd; border-radius: 5px;'>"]
+    for token, status, color in comparison_result:
+        if status == "correct" and color == "green":
+            # Correct punctuation
+            parts.append(f"<span style='background-color: #d4edda; color: #155724; padding: 2px 4px; margin: 1px; border-radius: 3px; font-weight: bold;'>{escape(token)}</span>")
+        elif color == "red":
+            # Incorrect, missing, or extra punctuation/word
+            if "→''" in token:
+                # Missing punctuation or word: show the ground-truth side only
+                shown = escape(token.split("→")[0]) + "→∅"
+            elif "''→" in token:
+                # Extra punctuation or word: show the predicted side only
+                shown = "∅→" + escape(token.split("→")[1])
+            else:
+                # Wrong punctuation/word
+                shown = escape(token)
+            parts.append(f"<span style='{red_style}'>{shown}</span>")
+        elif color == "orange":
+            # Word difference
+            parts.append(f"<span style='background-color: #fff3cd; color: #856404; padding: 2px 4px; margin: 1px; border-radius: 3px;'>{escape(token)}</span>")
+        else:
+            # Correct word
+            parts.append(f"<span style='padding: 2px 4px; margin: 1px;'>{escape(token)}</span>")
+        # Add space after each token
+        parts.append(" ")
+    parts.append("</div>")
+    # Add legend
+    parts.append("""
+    <div style='margin-top: 15px; padding: 10px; background-color: #f8f9fa; border-radius: 5px; font-size: 14px;'>
+        <strong>Legend:</strong><br>
+        <span style='background-color: #d4edda; color: #155724; padding: 1px 3px; border-radius: 2px; margin: 2px;'>✓</span> Correct punctuation &nbsp;
+        <span style='background-color: #f8d7da; color: #721c24; padding: 1px 3px; border-radius: 2px; margin: 2px;'>✗</span> Wrong/Missing/Extra punctuation &nbsp;
+        <span style='background-color: #fff3cd; color: #856404; padding: 1px 3px; border-radius: 2px; margin: 2px;'>~</span> Word difference &nbsp;
+        <span style='padding: 1px 3px; margin: 2px;'>◦</span> Correct word<br>
+        <strong>∅</strong> = Empty/Missing
+    </div>
+    """)
+    return "".join(parts)
+def process_punctuation_restoration(input_text, ground_truth=""):
+    """Restore punctuation for ``input_text`` and optionally evaluate it against ``ground_truth``.
+
+    Returns:
+        A 5-tuple in the order the Gradio outputs expect: predicted text,
+        comparison HTML, timing message, evaluation table (``None`` when no
+        ground truth is given) and the normalized ground-truth line.
+    """
+    if not input_text.strip():
+        return "Please enter input text", "", "", None, ""
+    failure = (f"Error : {input_text}", 999999)
+    try:
+        predicted_text, api_time = restore_punctuation(input_text)
+    except Exception:
+        # Was a bare ``except:``, which also swallowed KeyboardInterrupt/SystemExit
+        predicted_text, api_time = failure
+    else:
+        # restore_punctuation reports failures as "...Error..." strings; a
+        # response without "restored_text" yields None and is a failure too.
+        if predicted_text is None or "Error" in str(predicted_text):
+            predicted_text, api_time = failure
+    time_info = f"API call completed in {api_time:.3f} seconds"
+    # The API may return a one-element list; show its first item
+    predicted_text = predicted_text[0] if isinstance(predicted_text, list) else predicted_text
+    print(f"input_text: {input_text}", flush=True)
+    print(f"predicted_text: {predicted_text}", flush=True)
+    if not ground_truth.strip():
+        return predicted_text, "", time_info, None, ""
+    # Normalize ground truth
+    ground_truth_normalized = clean_and_normalize_text(ground_truth)
+    # Compare texts
+    comparison_result, correct_puncs, wrong_puncs, gt_punc_counts = compare_texts(
+        ground_truth_normalized, predicted_text
+    )
+    comparison_html = format_comparison_html(comparison_result)
+    eval_table = create_evaluation_table(correct_puncs, wrong_puncs, gt_punc_counts)
+    return predicted_text, comparison_html, time_info, eval_table, f"Normalized Ground Truth: {ground_truth_normalized}"
+# Create Gradio interface
+def create_interface():
+    """Build and return the Gradio Blocks UI.
+
+    The left column holds the two input boxes and the submit button; the right
+    column holds the outputs. Clicking the button calls
+    ``process_punctuation_restoration``.
+    """
+    with gr.Blocks(title="Punctuation Restoration Evaluator", theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🔤 Punctuation Restoration Evaluator")
+        gr.Markdown("Enter text to restore punctuation. Optionally provide ground truth for evaluation.")
+        with gr.Row():
+            with gr.Column(scale=1):
+                input_text = gr.Textbox(
+                    label="Input Text (without punctuation)",
+                    placeholder="পুরুষের সংখ্যা মোট জনসংখ্যার ৫২ এবং নারীর সংখ্যা ৪৮ শহরের সাক্ষরতার হার কত",
+                    lines=4
+                )
+                ground_truth = gr.Textbox(
+                    label="Ground Truth (optional)",
+                    placeholder="পুরুষের সংখ্যা মোট জনসংখ্যার ৫২, এবং নারীর সংখ্যা ৪৮। শহরের সাক্ষরতার হার কত?",
+                    lines=4
+                )
+                submit_btn = gr.Button("🚀 Restore Punctuation", variant="primary")
+            # Placed inside the Row so it sits beside the input column; it was
+            # previously a sibling of the Row, where ``scale`` has no effect.
+            with gr.Column(scale=2):
+                api_time = gr.Textbox(label="⏱️ API Response Time", interactive=False)
+                predicted_output = gr.Textbox(
+                    label="📝 Predicted Output",
+                    lines=3,
+                    interactive=False
+                )
+                normalized_gt = gr.Textbox(
+                    label="📋 Normalized Ground Truth",
+                    lines=2,
+                    interactive=False
+                )
+                comparison_output = gr.HTML(
+                    label="🔍 Token-wise Comparison",
+                    value="<p>Comparison will appear here after processing with ground truth.</p>"
+                )
+                evaluation_table = gr.Dataframe(
+                    label="📊 Punctuation Evaluation Metrics",
+                    headers=["Punctuation Name", "Correctly Classified", "Wrongly Classified", "Count in Ground Truth"],
+                    interactive=False
+                )
+        # Legend
+        gr.Markdown("""
+        ### 🎨 Color Legend:
+        - 🟢 **Green**: Correctly predicted punctuation
+        - 🔴 **Red**: Incorrectly predicted, missing, or extra punctuation/word
+        - 🟡 **Orange**: Word-level differences
+        - ⚫ **Black**: Correct words/tokens
+        - **∅**: Empty/Missing (instead of showing word→word or punct→word)
+        """)
+        # Output order must match the tuple returned by process_punctuation_restoration
+        submit_btn.click(
+            fn=process_punctuation_restoration,
+            inputs=[input_text, ground_truth],
+            outputs=[predicted_output, comparison_output, api_time, evaluation_table, normalized_gt]
+        )
+        # Example section
+        gr.Markdown("### 📚 Example")
+        gr.Examples(
+            examples=[
+                [
+                    "পুরুষের সংখ্যা মোট জনসংখ্যার ৫২ এবং নারীর সংখ্যা ৪৮ শহরের সাক্ষরতার হার কত",
+                    "পুরুষের সংখ্যা মোট জনসংখ্যার ৫২, এবং নারীর সংখ্যা ৪৮। শহরের সাক্ষরতার হার কত?"
+                ],
+                [
+                    "ক্রিকেট বিশ্বের কাছে নিজের আগামীবার তা ভালোভাবেই পৌঁছে দিলেন পাকিস্তানের পেসার আমের জামান",
+                    ""
+                ]
+            ],
+            inputs=[input_text, ground_truth]
+        )
+    return app
+if __name__ == "__main__":
+    # Build the UI and serve it on all interfaces at port 7860
+    app = create_interface()
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        debug=True
+    )

config.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from transformers import *
+# Special-token ids (sequence start, padding, sequence end, unknown) for each
+# token style; the style names are the last field of the MODELS entries below.
+TOKEN_IDX = {
+    'bert': {
+        'START_SEQ': 101,
+        'PAD': 0,
+        'END_SEQ': 102,
+        'UNK': 100
+    },
+    'xlm': {
+        'START_SEQ': 0,
+        'PAD': 2,
+        'END_SEQ': 1,
+        'UNK': 3
+    },
+    'roberta': {
+        'START_SEQ': 0,
+        'PAD': 1,
+        'END_SEQ': 2,
+        'UNK': 3
+    },
+    'albert': {
+        'START_SEQ': 2,
+        'PAD': 0,
+        'END_SEQ': 3,
+        'UNK': 1
+    },
+}
+# Label name -> class index. '0' (the digit zero, not the letter O) means no punctuation.
+punctuation_dict = {
+    '0': 0,
+    "DARI": 1,
+    "COMMA": 2,
+    "SEMICOLON": 3,
+    "QUESTION": 4,
+    "EXCLAMATION": 5,
+    "COLON": 6,
+    "HYPHEN": 7,
+}
+# Class index -> punctuation character; indices follow punctuation_dict above
+punctuation_map = {
+    0: "",
+    1: '।', # 'DARI'
+    2: ',', # 'COMMA'
+    3: ';', # 'SEMICOLON'
+    4: '?', # 'QUESTION'
+    5: '!', # 'EXCLAMATION'
+    6: ':', # 'COLON'
+    7: '-', # 'HYPHEN'
+}
+# Pretrained model name or local path -> (model class, tokenizer class, output dimension, token style).
+# The token style is a key of TOKEN_IDX.
+MODELS = {
+    'bert-base-uncased': (BertModel, BertTokenizer, 768, 'bert'),
+    'bert-large-uncased': (BertModel, BertTokenizer, 1024, 'bert'),
+    'bert-base-multilingual-cased': (BertModel, BertTokenizer, 768, 'bert'),
+    'bert-base-multilingual-uncased': (BertModel, BertTokenizer, 768, 'bert'),
+    'sagorsarker/bangla-bert-base': (BertModel, BertTokenizer, 768, 'bert'),
+    # 'distilbert-base-multilingual-cased': (AutoModelForMaskedLM, AutoTokenizer, 768, 'bert'),
+    'xlm-mlm-en-2048': (XLMModel, XLMTokenizer, 2048, 'xlm'),
+    'xlm-mlm-100-1280': (XLMModel, XLMTokenizer, 1280, 'xlm'),
+    'roberta-base': (RobertaModel, RobertaTokenizer, 768, 'roberta'),
+    'roberta-large': (RobertaModel, RobertaTokenizer, 1024, 'roberta'),
+    'neuralspace-reverie/indic-transformers-bn-roberta': (RobertaModel, RobertaTokenizer, 768, 'roberta'),
+    'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer, 768, 'bert'),
+    'distilbert-base-multilingual-cased': (DistilBertModel, DistilBertTokenizer, 768, 'bert'),
+    # Local copy of the tokenizer files shipped alongside the code
+    './distilbert-base-multilingual-cased': (DistilBertModel, DistilBertTokenizer, 768, 'bert'),
+    'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer, 768, 'roberta'),
+    'xlm-roberta-large': (XLMRobertaModel, XLMRobertaTokenizer, 1024, 'roberta'),
+    'albert-base-v1': (AlbertModel, AlbertTokenizer, 768, 'albert'),
+    'albert-base-v2': (AlbertModel, AlbertTokenizer, 768, 'albert'),
+    'albert-large-v2': (AlbertModel, AlbertTokenizer, 1024, 'albert'),
+}

distilbert-base-multilingual-cased/config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForMaskedLM"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_past": true,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "vocab_size": 119547
+}

distilbert-base-multilingual-cased/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

distilbert-base-multilingual-cased/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": false, "model_max_length": 512}

distilbert-base-multilingual-cased/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

entrypoint.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/bin/bash
+# Start the API in background (it listens on port 5685)
+python api_onnx.py &
+# Poll until the API answers instead of sleeping a fixed 5 seconds: loading
+# the ONNX model can take longer than that. After about 60 seconds the UI is
+# started anyway, which keeps the old best-effort behaviour.
+for _ in $(seq 1 60); do
+    if curl -sf -o /dev/null http://localhost:5685/docs; then
+        break
+    fi
+    sleep 1
+done
+# Start the Gradio UI (on port 7860)
+python app.py

inference_onnx.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import torch
+import numpy as np
+from typing import List, Union, Dict, Any
+from config import *
+def get_encoded_input_single(text, tokenizer, token_style, sequence_len = 256):
+    """Encode one text into fixed-length id, attention-mask and word-end-mask lists.
+
+    Args:
+        text: Whitespace-separated words.
+        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
+        token_style: Key of ``TOKEN_IDX`` selecting the special-token ids.
+        sequence_len: Length the returned lists are padded to.
+
+    Returns:
+        Dict with ``input_values`` (token ids), ``attention_mask`` (1 for every
+        non-PAD id) and ``y_mask`` (1 on the last sub-token of each encoded
+        word). Words that no longer fit within ``sequence_len`` are left out.
+    """
+    special = TOKEN_IDX[token_style]
+    x = [special['START_SEQ']]
+    y_mask = [0]
+    for word in text.split():
+        tokens = tokenizer.tokenize(word)
+        # A word the tokenizer reduces to nothing still occupies one slot (UNK)
+        # so every word keeps exactly one y_mask position; previously an empty
+        # token list raised IndexError on tokens[-1].
+        needed = max(len(tokens), 1)
+        if needed + len(x) >= sequence_len:
+            # Always leave room for END_SEQ
+            break
+        ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens] or [special['UNK']]
+        x.extend(ids)
+        # Only the last sub-token of a word carries the punctuation prediction
+        y_mask.extend([0] * (len(ids) - 1) + [1])
+    x.append(special['END_SEQ'])
+    y_mask.append(0)
+    # Pad to sequence_len
+    padding = sequence_len - len(x)
+    if padding > 0:
+        x = x + [special['PAD']] * padding
+        y_mask = y_mask + [0] * padding
+    attn_mask = [1 if token != special['PAD'] else 0 for token in x]
+    return {
+        'input_values': x,
+        'attention_mask': attn_mask,
+        'y_mask': y_mask
+    }
+def get_encoded_input_batch(texts, tokenizer, token_style, sequence_len = 256):
+    """Encode every text and stack the per-text lists into one tensor per field."""
+    encoded_items = [
+        get_encoded_input_single(text, tokenizer, token_style, sequence_len)
+        for text in texts
+    ]
+    # One tensor per field, with one row per input text
+    return {
+        field: torch.tensor([item[field] for item in encoded_items])
+        for field in ('input_values', 'attention_mask', 'y_mask')
+    }
+def run_onnx_inference(input_values, attention_mask, session):
+    """Feed the batch to the ONNX session and return the arg-max class index per position.
+
+    The session's first two inputs receive ``input_values`` and
+    ``attention_mask`` (as numpy arrays); its first output is reduced with
+    ``argmax`` over dimension 2.
+    """
+    session_inputs = session.get_inputs()
+    feeds = {
+        session_inputs[0].name: input_values.cpu().numpy(),
+        session_inputs[1].name: attention_mask.cpu().numpy(),
+    }
+    logits_name = session.get_outputs()[0].name
+    logits = session.run([logits_name], feeds)[0]
+    return torch.argmax(torch.tensor(logits), dim=2)
+def get_transcription_batch(texts, session, tokenizer, device, token_style):
+    """Punctuate several texts in one inference call.
+
+    Args:
+        texts: List of input strings.
+        session: ONNX session passed to ``run_onnx_inference``.
+        tokenizer, token_style: Passed to ``get_encoded_input_batch``.
+        device: Device the input tensors are moved to before inference.
+
+    Returns:
+        List with one punctuated string per input text, in input order.
+    """
+    encoded_batch = get_encoded_input_batch(texts, tokenizer, token_style)
+    input_values = encoded_batch['input_values'].to(device)
+    attention_mask = encoded_batch['attention_mask'].to(device)
+    predictions = run_onnx_inference(input_values, attention_mask, session)
+    results = []
+    for text, y_mask, y_predict in zip(texts, encoded_batch['y_mask'], predictions):
+        words = text.split()
+        # Positions flagged in y_mask are word ends, in word order
+        labels = y_predict[y_mask == 1].tolist()
+        pieces = [word + punctuation_map[label] for word, label in zip(words, labels)]
+        # Words beyond the encoder's sequence limit were never scored; keep them
+        # unpunctuated instead of silently dropping them from the output.
+        pieces.extend(words[len(pieces):])
+        results.append(' '.join(pieces))
+    return results
+def get_transcription(text_or_texts, session, tokenizer, device, token_style):
+    """
+    Punctuate a single text or a list of texts with the ONNX model.
+    A single string is wrapped in a one-element list and processed as a batch.
+    Args:
+        text_or_texts: Single text string or list of text strings
+        session, tokenizer, device, token_style: Forwarded unchanged to
+            get_transcription_batch
+    Returns:
+        List of punctuated strings, one per input text. A single string
+        input also yields a list (with one element), not a bare string.
+    Raises:
+        ValueError: If the input is neither a string nor a list
+    """
+    if isinstance(text_or_texts, str):
+        return get_transcription_batch([text_or_texts], session, tokenizer, device, token_style)
+    elif isinstance(text_or_texts, list):
+        return get_transcription_batch(text_or_texts, session, tokenizer, device, token_style)
+    else:
+        raise ValueError("Input must be either a string or a list of strings")
+if __name__ == '__main__':
+    # Manual timing check. get_transcription needs a session, tokenizer, device
+    # and token style, so they are loaded here from the same local paths the
+    # API uses; the calls previously omitted them and raised TypeError.
+    import time
+    import onnxruntime as ort
+    model_dir = "./distilbert-base-multilingual-cased"
+    tokenizer = MODELS[model_dir][1].from_pretrained(model_dir)
+    token_style = MODELS[model_dir][3]
+    device = torch.device('cpu')
+    session = ort.InferenceSession("./poc_onnx_model_punctuation_batch.onnx", providers=['CPUExecutionProvider'])
+    test_text = 'ক্রিকেট বিশ্বের কাছে নিজের আগামীবার তা ভালোভাবেই পৌঁছে দিলেন পাকিস্তানের পেসার আমের জামান চতুর্দশ পাকিস্তানি বোলার হিসেবে অভিষেকেই তুলে নিলেন ছয় উইকেট'
+    print("Testing single text processing:")
+    print("=" * 50)
+    # Test single text processing
+    for i in range(3):
+        start_time = time.time()
+        result = get_transcription(test_text, session, tokenizer, device, token_style)
+        end_time = time.time()
+        print(f"Run {i+1}: {end_time - start_time:.4f}s")
+    # get_transcription returns a list even for a single string
+    print(f"\nSingle result: {result[0][:100]}...")
+    print("\nTesting batch text processing:")
+    print("=" * 50)
+    # Test batch processing
+    batch_texts = [
+        'ক্রিকেট বিশ্বের কাছে নিজের আগামীবার তা ভালোভাবেই পৌঁছে দিলেন পাকিস্তানের পেসার আমের জামান চতুর্দশ পাকিস্তানি বোলার হিসেবে অভিষেকেই তুলে নিলেন ছয় উইকেট',
+        'ক্রিকেট বিশ্বের কাছে নিজের আগামীবার তা ভালোভাবেই পৌঁছে দিলেন পাকিস্তানের পেসার আমের জামান চতুর্দশ পাকিস্তানি বোলার হিসেবে অভিষেকেই তুলে নিলেন ছয় উইকেট',
+    ]
+    start_time = time.time()
+    batch_results = get_transcription(batch_texts, session, tokenizer, device, token_style)
+    end_time = time.time()
+    print(f"Batch processing time: {end_time - start_time:.4f}s")
+    print(f"Processed {len(batch_texts)} texts")
+    print(f"Average time per text: {(end_time - start_time) / len(batch_texts):.4f}s")
+    for i, result in enumerate(batch_results):
+        print(f"Text {i+1}: {result[:50]}...")

poc_onnx_model_punctuation_batch.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72f36708c26dee2494269930d59e64e09f142ee2749082806b6fc5fb6d13e511
+size 576918507

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+transformers==4.20.1
+gradio
+requests
+pandas
+fastapi
+uvicorn
+onnxruntime-gpu
+numpy
+sacremoses==0.1.1