Spaces: Paused
Update script1.js
Browse files - script1.js (+46 -59)
script1.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
// Constants and Configuration
|
| 2 |
const USER_SPEECH_INTERRUPT_DELAY = 500;
|
| 3 |
-
const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
|
| 4 |
const CHUNK_SIZE = 300;
|
| 5 |
-
const MAX_PREFETCH_REQUESTS = 5;
|
| 6 |
const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
|
| 7 |
const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
|
| 8 |
const WEBCAM_INTERVAL = 5000;
|
| 9 |
-
const MAX_HISTORY_LENGTH = 6;
|
| 10 |
|
| 11 |
// DOM Elements
|
| 12 |
const startStopButton = document.getElementById('startStopButton');
|
|
@@ -17,6 +17,7 @@ const responseTimeDisplay = document.getElementById('responseTime');
|
|
| 17 |
const userActivityIndicator = document.getElementById('userIndicator');
|
| 18 |
const aiActivityIndicator = document.getElementById('aiIndicator');
|
| 19 |
const transcriptDiv = document.getElementById('transcript');
|
|
|
|
| 20 |
|
| 21 |
// Speech Recognition
|
| 22 |
let speechRecognizer;
|
|
@@ -46,6 +47,22 @@ let conversationHistory = [];
|
|
| 46 |
// Audio Caching
|
| 47 |
const audioCache = new Map();
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
// Utility Functions
|
| 50 |
|
| 51 |
// Normalize query text
|
|
@@ -58,7 +75,7 @@ const generateCacheKey = (normalizedQuery, voice, history, modelName) =>
|
|
| 58 |
// Update activity indicators
|
| 59 |
const updateActivityIndicators = (state = null) => {
|
| 60 |
userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
|
| 61 |
-
|
| 62 |
if (isRequestInProgress && !currentAudio) {
|
| 63 |
aiActivityIndicator.textContent = "AI: Processing...";
|
| 64 |
} else if (currentAudio && !isUserSpeaking) {
|
|
@@ -194,7 +211,6 @@ const cancelPrefetchRequests = (query) => {
|
|
| 194 |
|
| 195 |
// Send a query to the AI
|
| 196 |
async function sendQueryToAI(query) {
|
| 197 |
-
console.log("Sending query to AI:", query);
|
| 198 |
isRequestInProgress = true;
|
| 199 |
updateActivityIndicators();
|
| 200 |
firstResponseTextTimestamp = null;
|
|
@@ -210,7 +226,6 @@ async function sendQueryToAI(query) {
|
|
| 210 |
combinedQuery += `, {USER: "${query}"}`;
|
| 211 |
|
| 212 |
await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
|
| 213 |
-
|
| 214 |
} catch (error) {
|
| 215 |
if (error.name !== 'AbortError') {
|
| 216 |
console.error("Error sending query to AI:", error);
|
|
@@ -226,7 +241,7 @@ const processSpeechTranscript = (transcript) => {
|
|
| 226 |
const trimmedTranscript = transcript.trimStart();
|
| 227 |
if (trimmedTranscript !== '' && !isRequestInProgress) {
|
| 228 |
activeQuery = trimmedTranscript;
|
| 229 |
-
addToConversationHistory('user', activeQuery);
|
| 230 |
sendQueryToAI(activeQuery);
|
| 231 |
}
|
| 232 |
};
|
|
@@ -330,7 +345,6 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
|
|
| 330 |
let fullResponseText2 = "";
|
| 331 |
let textChunk = "";
|
| 332 |
|
| 333 |
-
|
| 334 |
try {
|
| 335 |
while (true) {
|
| 336 |
const { done, value } = await reader.read();
|
|
@@ -346,34 +360,38 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
|
|
| 346 |
buffer += chunk;
|
| 347 |
const lines = buffer.split('\n');
|
| 348 |
|
| 349 |
-
for (const line of lines) {
|
| 350 |
if (line.startsWith('data: ')) {
|
| 351 |
const textContent = line.substring(6).trim();
|
| 352 |
if (textContent) {
|
| 353 |
if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
|
| 354 |
-
|
|
|
|
| 355 |
fullResponseText2 += textContent + " ";
|
| 356 |
textChunk += textContent + " ";
|
| 357 |
transcriptDiv.textContent = fullResponseText2;
|
| 358 |
|
| 359 |
|
| 360 |
if (textChunk.length >= CHUNK_SIZE) {
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
}
|
| 368 |
}
|
| 369 |
}
|
| 370 |
}
|
|
|
|
|
|
|
| 371 |
}
|
| 372 |
} catch (error) {
|
| 373 |
console.error("Error in handleStreamingResponse:", error);
|
| 374 |
} finally {
|
| 375 |
-
|
| 376 |
-
|
|
|
|
| 377 |
const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
|
| 378 |
if (audioUrl) {
|
| 379 |
audioPlaybackQueue.push({ url: audioUrl });
|
|
@@ -381,12 +399,9 @@ const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
|
|
| 381 |
}
|
| 382 |
}
|
| 383 |
|
| 384 |
-
addToConversationHistory('assistant', fullResponseText2);
|
| 385 |
-
fullResponseText = "";
|
| 386 |
-
fullResponseText2 = "";
|
| 387 |
-
|
| 388 |
-
reader.releaseLock();
|
| 389 |
-
|
| 390 |
}
|
| 391 |
};
|
| 392 |
|
|
@@ -484,20 +499,14 @@ if ('webkitSpeechRecognition' in window) {
|
|
| 484 |
speechRecognizer.stop();
|
| 485 |
isSpeechRecognitionActive = false;
|
| 486 |
startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
video.srcObject = null;
|
| 491 |
-
lastCaption = "";
|
| 492 |
-
isCaptioningEnabled = false;
|
| 493 |
-
|
| 494 |
} else {
|
| 495 |
speechRecognizer.start();
|
| 496 |
isSpeechRecognitionActive = true;
|
| 497 |
startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
|
| 498 |
-
|
| 499 |
-
// Start webcam capture when speech recognition starts
|
| 500 |
-
isCaptioningEnabled = true;
|
| 501 |
startWebcam();
|
| 502 |
}
|
| 503 |
});
|
|
@@ -508,28 +517,13 @@ if ('webkitSpeechRecognition' in window) {
|
|
| 508 |
setInterval(updateLatency, 100);
|
| 509 |
|
| 510 |
|
| 511 |
-
|
| 512 |
-
// Webcam Integration
|
| 513 |
-
import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
|
| 514 |
-
|
| 515 |
-
const video = document.getElementById('webcam');
|
| 516 |
-
let app;
|
| 517 |
-
let lastCaption = "";
|
| 518 |
-
|
| 519 |
-
const clients = [
|
| 520 |
-
"multimodalart/Florence-2-l4",
|
| 521 |
-
"gokaygokay/Florence-2",
|
| 522 |
-
"multimodalart/Florence-2-l4-2",
|
| 523 |
-
"gokaygokay/Florence-2",
|
| 524 |
-
];
|
| 525 |
-
|
| 526 |
-
let webcamInterval; // Store the interval ID
|
| 527 |
|
| 528 |
async function startWebcam() {
|
| 529 |
try {
|
| 530 |
const stream = await navigator.mediaDevices.getUserMedia({ video: true });
|
| 531 |
video.srcObject = stream;
|
| 532 |
-
webcamInterval = setInterval(captureAndProcessImage, WEBCAM_INTERVAL);
|
| 533 |
} catch (error) {
|
| 534 |
console.error("Error accessing webcam: ", error);
|
| 535 |
}
|
|
@@ -559,11 +553,4 @@ async function processWithGradio(imageBlob) {
|
|
| 559 |
} catch (error) {
|
| 560 |
console.error("Error processing with Gradio:", error);
|
| 561 |
}
|
| 562 |
-
}
|
| 563 |
-
|
| 564 |
-
window.onload = () => {
|
| 565 |
-
// Start webcam only if speech recognition is active
|
| 566 |
-
if (isCaptioningEnabled) {
|
| 567 |
-
startWebcam();
|
| 568 |
-
}
|
| 569 |
-
};
|
|
|
|
| 1 |
// Constants and Configuration
|
| 2 |
const USER_SPEECH_INTERRUPT_DELAY = 500;
|
| 3 |
+
const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
|
| 4 |
const CHUNK_SIZE = 300;
|
| 5 |
+
const MAX_PREFETCH_REQUESTS = 5;
|
| 6 |
const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
|
| 7 |
const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
|
| 8 |
const WEBCAM_INTERVAL = 5000;
|
| 9 |
+
const MAX_HISTORY_LENGTH = 6;
|
| 10 |
|
| 11 |
// DOM Elements
|
| 12 |
const startStopButton = document.getElementById('startStopButton');
|
|
|
|
| 17 |
const userActivityIndicator = document.getElementById('userIndicator');
|
| 18 |
const aiActivityIndicator = document.getElementById('aiIndicator');
|
| 19 |
const transcriptDiv = document.getElementById('transcript');
|
| 20 |
+
const video = document.getElementById('webcam');
|
| 21 |
|
| 22 |
// Speech Recognition
|
| 23 |
let speechRecognizer;
|
|
|
|
| 47 |
// Audio Caching
|
| 48 |
const audioCache = new Map();
|
| 49 |
|
| 50 |
+
// Image Captioning State
|
| 51 |
+
let isCaptioningEnabled = false;
|
| 52 |
+
let lastCaption = "";
|
| 53 |
+
|
| 54 |
+
// Webcam Integration
|
| 55 |
+
import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
|
| 56 |
+
const clients = [
|
| 57 |
+
"multimodalart/Florence-2-l4",
|
| 58 |
+
"gokaygokay/Florence-2",
|
| 59 |
+
"multimodalart/Florence-2-l4-2",
|
| 60 |
+
"gokaygokay/Florence-2",
|
| 61 |
+
];
|
| 62 |
+
let app;
|
| 63 |
+
let webcamInterval;
|
| 64 |
+
|
| 65 |
+
|
| 66 |
// Utility Functions
|
| 67 |
|
| 68 |
// Normalize query text
|
|
|
|
| 75 |
// Update activity indicators
|
| 76 |
const updateActivityIndicators = (state = null) => {
|
| 77 |
userActivityIndicator.textContent = isUserSpeaking ? "User: Speaking" : "User: Idle";
|
| 78 |
+
|
| 79 |
if (isRequestInProgress && !currentAudio) {
|
| 80 |
aiActivityIndicator.textContent = "AI: Processing...";
|
| 81 |
} else if (currentAudio && !isUserSpeaking) {
|
|
|
|
| 211 |
|
| 212 |
// Send a query to the AI
|
| 213 |
async function sendQueryToAI(query) {
|
|
|
|
| 214 |
isRequestInProgress = true;
|
| 215 |
updateActivityIndicators();
|
| 216 |
firstResponseTextTimestamp = null;
|
|
|
|
| 226 |
combinedQuery += `, {USER: "${query}"}`;
|
| 227 |
|
| 228 |
await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
|
|
|
|
| 229 |
} catch (error) {
|
| 230 |
if (error.name !== 'AbortError') {
|
| 231 |
console.error("Error sending query to AI:", error);
|
|
|
|
| 241 |
const trimmedTranscript = transcript.trimStart();
|
| 242 |
if (trimmedTranscript !== '' && !isRequestInProgress) {
|
| 243 |
activeQuery = trimmedTranscript;
|
| 244 |
+
addToConversationHistory('user', activeQuery);
|
| 245 |
sendQueryToAI(activeQuery);
|
| 246 |
}
|
| 247 |
};
|
|
|
|
| 345 |
let fullResponseText2 = "";
|
| 346 |
let textChunk = "";
|
| 347 |
|
|
|
|
| 348 |
try {
|
| 349 |
while (true) {
|
| 350 |
const { done, value } = await reader.read();
|
|
|
|
| 360 |
buffer += chunk;
|
| 361 |
const lines = buffer.split('\n');
|
| 362 |
|
| 363 |
+
for (const line of lines) {
|
| 364 |
if (line.startsWith('data: ')) {
|
| 365 |
const textContent = line.substring(6).trim();
|
| 366 |
if (textContent) {
|
| 367 |
if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
|
| 368 |
+
|
| 369 |
+
fullResponseText += textContent + " ";
|
| 370 |
fullResponseText2 += textContent + " ";
|
| 371 |
textChunk += textContent + " ";
|
| 372 |
transcriptDiv.textContent = fullResponseText2;
|
| 373 |
|
| 374 |
|
| 375 |
if (textChunk.length >= CHUNK_SIZE) {
|
| 376 |
+
const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
|
| 377 |
+
if (audioUrl) {
|
| 378 |
+
audioPlaybackQueue.push({ url: audioUrl });
|
| 379 |
+
if (!currentAudio) playNextAudio();
|
| 380 |
+
}
|
| 381 |
+
textChunk = "";
|
| 382 |
}
|
| 383 |
}
|
| 384 |
}
|
| 385 |
}
|
| 386 |
+
|
| 387 |
+
buffer = lines[lines.length - 1];
|
| 388 |
}
|
| 389 |
} catch (error) {
|
| 390 |
console.error("Error in handleStreamingResponse:", error);
|
| 391 |
} finally {
|
| 392 |
+
reader.releaseLock();
|
| 393 |
+
|
| 394 |
+
if (textChunk !== "") { // Send any remaining text
|
| 395 |
const audioUrl = await generateTextToSpeechAudio(textChunk, voice);
|
| 396 |
if (audioUrl) {
|
| 397 |
audioPlaybackQueue.push({ url: audioUrl });
|
|
|
|
| 399 |
}
|
| 400 |
}
|
| 401 |
|
| 402 |
+
addToConversationHistory('assistant', fullResponseText2);
|
| 403 |
+
fullResponseText = "";
|
| 404 |
+
fullResponseText2 = "";
|
|
|
|
|
|
|
|
|
|
| 405 |
}
|
| 406 |
};
|
| 407 |
|
|
|
|
| 499 |
speechRecognizer.stop();
|
| 500 |
isSpeechRecognitionActive = false;
|
| 501 |
startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
|
| 502 |
+
clearInterval(webcamInterval);
|
| 503 |
+
video.srcObject = null;
|
| 504 |
+
lastCaption = "";
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
} else {
|
| 506 |
speechRecognizer.start();
|
| 507 |
isSpeechRecognitionActive = true;
|
| 508 |
startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
|
| 509 |
+
isCaptioningEnabled = true;
|
|
|
|
|
|
|
| 510 |
startWebcam();
|
| 511 |
}
|
| 512 |
});
|
|
|
|
| 517 |
setInterval(updateLatency, 100);
|
| 518 |
|
| 519 |
|
| 520 |
+
// Webcam Functions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
|
| 522 |
async function startWebcam() {
|
| 523 |
try {
|
| 524 |
const stream = await navigator.mediaDevices.getUserMedia({ video: true });
|
| 525 |
video.srcObject = stream;
|
| 526 |
+
webcamInterval = setInterval(captureAndProcessImage, WEBCAM_INTERVAL);
|
| 527 |
} catch (error) {
|
| 528 |
console.error("Error accessing webcam: ", error);
|
| 529 |
}
|
|
|
|
| 553 |
} catch (error) {
|
| 554 |
console.error("Error processing with Gradio:", error);
|
| 555 |
}
|
| 556 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|