File size: 6,201 Bytes
b14d567
 
 
 
7a08a75
b14d567
0378a63
878e432
b14d567
 
 
 
 
 
 
fb60bd2
b14d567
878e432
b14d567
7a08a75
 
 
 
 
 
 
 
 
 
 
b14d567
 
 
 
 
 
 
 
 
7a08a75
0378a63
7a08a75
b14d567
878e432
 
 
 
 
 
 
 
b14d567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878e432
 
 
 
 
 
 
 
 
 
 
b14d567
 
 
 
878e432
fb60bd2
878e432
 
 
0378a63
b14d567
 
fb60bd2
878e432
 
 
 
 
 
fb60bd2
 
 
 
 
 
 
b14d567
 
 
 
 
878e432
 
 
 
 
 
 
b14d567
 
 
 
 
 
7a08a75
08f0bdc
7a08a75
 
b14d567
7a08a75
 
b14d567
7a08a75
08f0bdc
b14d567
7a08a75
b14d567
 
 
 
 
 
 
 
 
fb60bd2
 
 
b14d567
 
7a08a75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878e432
b14d567
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
<script lang="ts">
    import Textarea from "@/lib/components/ui/textarea/textarea.svelte";
    import Badge from "@/lib/components/ui/badge/badge.svelte";
    import * as webllm from "@mlc-ai/web-llm";
    import { onMount, tick } from 'svelte';

    // Must match a `model_id` entry in appConfig.model_list (loadWebLLM).
    // NOTE(review): the id says q0f32 but the weight/wasm URLs below point at
    // q0f16 artifacts — confirm the mismatch is intentional.
    let selectedModel = "smollm-360M-instruct-add-basics-q0f32-MLC";

    let engine: webllm.MLCEngineInterface;      // set once loadWebLLM succeeds
    let isLoading = false;                      // true while the model downloads/compiles
    let loadingStatus = '';                     // progress text from WebLLM's init callback
    let inputText = '';                         // bound to the Textarea
    let outputText = '';                        // last completion, shown in the <pre>
    let error = '';                             // user-visible load/generation error
    let completionSpeed: number | null = null;  // wall-clock ms of the last request
    let tokensPerSecond: number | null = null;  // completion_tokens / elapsed seconds
    let isGenerating = false;                   // guards against concurrent requests
    let pendingRequest: string | null = null;   // latest input queued while busy

    // Example prompts rendered as clickable badges when the input is empty.
    const promptExamples = [
        "What is the capital of France?",
        "Tell me a story about a cat.",
        "Write a poem about the ocean.",
    ]

    /**
     * Fill the input box with an example prompt and immediately request a
     * completion for it.
     *
     * @param prompt - example text chosen by the user.
     */
    async function setPrompt(prompt: string) {
        inputText = prompt;
        // Await the generation: the original fired it as a floating promise,
        // which turns any throw inside generateCompletion into an unhandled
        // rejection instead of flowing through its own error handling.
        await generateCompletion(prompt);
    }

    /**
     * Download and initialize the selected WebLLM model in the browser.
     *
     * Side effects: toggles `isLoading`, streams progress text into
     * `loadingStatus`, assigns `engine` on success, and sets `error` on
     * failure (e.g. no WebGPU support, network failure).
     */
    async function loadWebLLM() {
        isLoading = true;
        error = '';
        // Progress reports arrive repeatedly during download/compile.
        const initProgressCallback = (report: webllm.InitProgressReport) => {
            loadingStatus = report.text;
        };

        const appConfig: webllm.AppConfig = {
            model_list: [{
                // NOTE(review): the weight/wasm artifacts are q0f16 while the
                // model_id advertises q0f32 — confirm this is intentional.
                model: `https://huggingface.co/reach-vb/smollm-360M-instruct-add-basics-q0f16-MLC`,
                model_id: 'smollm-360M-instruct-add-basics-q0f32-MLC',
                model_lib: `${webllm.modelLibURLPrefix}${webllm.modelVersion}/SmolLM-360M-Instruct-q0f16-ctx2k_cs1k-webgpu.wasm`,
                overrides: { context_window_size: 2048 },
            },
            {
                model: `https://huggingface.co/mlc-ai/Qwen2-0.5B-Instruct-q4f16_1-MLC`,
                model_id: 'Qwen2-0.5B-Instruct-q4f16_1-MLC',
                model_lib: `${webllm.modelLibURLPrefix}${webllm.modelVersion}/Qwen2-0.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm`,
                overrides: { context_window_size: 2048 },
            }
        ],
        };

        try {
            engine = await webllm.CreateMLCEngine(selectedModel, {
                appConfig,
                initProgressCallback,
                logLevel: "INFO",
            });
        } catch (err) {
            // `err` is `unknown` under strict mode; narrow instead of
            // asserting `as Error` (a non-Error throw would yield undefined).
            const message = err instanceof Error ? err.message : String(err);
            error = `Failed to load the model: ${message}`;
        } finally {
            isLoading = false;
        }
    }

    /**
     * Run a chat completion for `content` and publish the result.
     *
     * Concurrency: only one request runs at a time. If called while the
     * engine is missing or busy, the (trimmed) text is parked in
     * `pendingRequest`; the active run replays the newest pending text from
     * its `finally` block, so rapid typing settles on the last input.
     *
     * Side effects: sets `outputText`, `completionSpeed`, `tokensPerSecond`,
     * `error`, and `isGenerating`.
     *
     * @param content - raw user text; whitespace-only input is ignored.
     */
    async function generateCompletion(content: string) {
        // Ignore empty input up front — the original only checked this after
        // the busy guard, so whitespace-only text could be queued as a
        // pending request (a harmless but pointless no-op).
        if (!content.trim()) return;

        if (!engine || isGenerating) {
            // Store the most recent request from the user while the current
            // request is being processed (or the model is still loading).
            pendingRequest = content.trim();
            return;
        }

        isGenerating = true;
        const startTime = performance.now();
        try {
            console.log("Generating completion:", content);
            const response = await engine.chat.completions.create({
                messages: [
                    {role: "user", content: content}
                ],
                max_tokens: 15,
            });

            outputText = response.choices[0].message.content || "";

            // Indicate that the response was cut short by the token cap.
            if (response.choices[0].finish_reason === "length") {
                outputText += "...";
            }

            const endTime = performance.now();
            const elapsedTimeInSeconds = (endTime - startTime) / 1000;
            completionSpeed = Math.round(endTime - startTime);

            const generatedTokens = response.usage?.completion_tokens || 0;
            tokensPerSecond = Math.round(generatedTokens / elapsedTimeInSeconds);

            error = '';
        } catch (err) {
            // Narrow the `unknown` catch variable instead of asserting.
            const message = err instanceof Error ? err.message : String(err);
            error = `Error: ${message}`;
        } finally {
            isGenerating = false;

            // Replay the newest input queued while this request was running.
            if (pendingRequest && pendingRequest !== content) {
                const nextRequest = pendingRequest;
                pendingRequest = null;
                await generateCompletion(nextRequest);
            }
        }
    }

    // Start downloading/compiling the model as soon as the component mounts.
    onMount(loadWebLLM);
</script>

<!-- Main demo layout: prompt input, status/metrics, example prompts, output. -->
<div class="flex my-20 flex-col items-center gap-4 max-w-xl mx-auto">
    <h1 class="text-center font-mono font-bold text-4xl">SmolLM 🤗</h1>
    <p class="text-center font-mono text-sm mb-4">Powered by <a href="https://huggingface.co/mlc-ai" target="_blank" class="underline text-blue-500">MLC</a> WebLLM <a class="underline text-blue-500" href="https://huggingface.co/HuggingFaceTB/smollm-360M-instruct-add-basics" target="_blank">SmolLM-360M-Instruct-Add-Basics</a> <span class="text-xs italic">(15 Max Tokens)</span></p>    

    <!-- Fires a completion on every keystroke; generateCompletion's
         pendingRequest queue serializes the resulting burst of calls. -->
    <Textarea 
        bind:value={inputText}
        on:input={() => generateCompletion(inputText)}
        disabled={isLoading}
        class="w-full text-lg" 
        placeholder="Say something..."
    />
    <p class="text-center text-xs italic">This model doesn't work well with extremely creative prompts.</p>
    <!-- One of: loading status, error message, or last-run metrics. -->
    {#if isLoading}
        <p class="text-sm text-slate-600 text-center">{loadingStatus}</p>
    {:else if error}
        <p class="text-sm text-red-600">{error}</p>
    {:else}
        <div class="flex gap-2">
            {#if completionSpeed !== null}
                <Badge>{completionSpeed}ms</Badge>
            {/if}
            {#if tokensPerSecond !== null}
                <Badge>{tokensPerSecond} tok/s</Badge>
            {/if}
        </div>
    {/if}
    <!-- Example prompts, shown only while the input box is empty. -->
    <div class="flex flex-col items-center mb-4">
        {#if inputText === ''}
        <p class="text-sm mb-2">Try these examples:</p>
        <div class="flex flex-wrap justify-center gap-2">
            {#each promptExamples as prompt}
                <button on:click={() => setPrompt(prompt)}>
                    <Badge
                        variant="outline"
                        class="cursor-pointer bg-orange-100 hover:bg-orange-200"
                    >
                        {prompt}
                    </Badge>
                </button>
            {/each}
        </div>
        {/if}
    </div>
    <!-- Completion output; whitespace-pre-wrap preserves model line breaks. -->
    <pre class="text-xl font-bold whitespace-pre-wrap">{outputText}</pre>

</div>