"""Chunk speaker-labelled VTT transcripts and store the chunks in SQLite.

The script reads a list of VTT files from the ``vtt_files`` table, merges
consecutive cues spoken by the same user into turns, splits every turn into
overlapping word windows and writes the result to the ``text_chunks`` table.
"""
import os
import re
import sqlite3
import sys
from contextlib import closing

import pandas as pd
from nltk.tokenize import word_tokenize

# One numbered cue:  <id>\n<start> --> <end>\n<speaker>: <text>\n
# Compiled once at import time instead of on every file.
# NOTE(review): with DOTALL the speaker group can span line breaks, so a cue
# whose text has no "speaker: " prefix is absorbed into the speaker name of a
# later cue. Confirm that every cue in the input carries a speaker label.
_CUE_PATTERN = re.compile(
    r"(\d+)\n(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\n(.*?): (.*?)\n",
    re.DOTALL,
)


def chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size=256, overlap=0.5):
    """Split *text* into overlapping word windows tagged with timestamps.

    Args:
        text: Text of one speaker turn.
        start_ts: Start timestamp of the turn.
        end_ts: End timestamp of the turn.
        chunk_size: Maximum number of tokens per chunk; must be >= 1.
        overlap: Fraction of ``chunk_size`` shared by consecutive chunks.
            The resulting stride is clamped to at least one token.

    Returns:
        A list of ``(chunk_text, start_ts, end_ts)`` tuples. Every token of
        the input appears in at least one chunk; the last chunk may hold
        fewer than ``chunk_size`` tokens. Empty text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    words = word_tokenize(text)
    # Clamp the stride so a high overlap or a tiny chunk_size can never
    # produce a zero step.
    step = max(1, int(chunk_size * (1 - overlap)))

    chunks = []
    for offset in range(0, len(words), step):
        window = words[offset:offset + chunk_size]
        # Placeholder timestamps: every chunk carries the span of the whole
        # turn because no per-word timing is available here.
        chunks.append((' '.join(window), start_ts, end_ts))
        if offset + chunk_size >= len(words):
            # This window already reached the last token; any further window
            # would only repeat a suffix of it.
            break
    return chunks


def read_vtt_files_from_db(db_path):
    """Return all ``(folder_name, file_name)`` rows of the ``vtt_files`` table.

    Args:
        db_path: Path of the SQLite database file.

    Returns:
        A list of ``(folder_name, file_name)`` tuples.
    """
    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        return conn.execute("SELECT folder_name, file_name FROM vtt_files").fetchall()


def _merge_speaker_turns(cues):
    """Merge consecutive cues of the same speaker into single turns.

    *cues* is an iterable of ``(cue_id, start, end, user, text)`` tuples.
    Returns a list of ``(user, text, start_ts, end_ts)`` tuples where the text
    of adjacent cues by one speaker is joined with single spaces, ``start_ts``
    is the start of the first merged cue and ``end_ts`` the end of the last.
    """
    turns = []
    current_user = None
    parts = []
    start_ts = None
    previous_end = None
    for _cue_id, start, end, user, text in cues:
        if user != current_user:
            if current_user is not None:
                turns.append((current_user, " ".join(parts), start_ts, previous_end))
            current_user = user
            parts = [text]
            start_ts = start
        else:
            parts.append(text)
        previous_end = end
    if current_user is not None:
        turns.append((current_user, " ".join(parts), start_ts, previous_end))
    return turns


def process_vtt_file(file_path, chunk_size=256, overlap=0.5):
    """Parse a VTT file and return its text as numbered, per-speaker chunks.

    Args:
        file_path: Path of the VTT file to read.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Fraction of ``chunk_size`` shared by consecutive chunks.

    Returns:
        A list of ``(chunk_id, chunk, start_ts, end_ts, user)`` tuples with
        ``chunk_id`` counting up from 1 across the whole file.
    """
    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        vtt_data = file.read()

    turns = _merge_speaker_turns(_CUE_PATTERN.findall(vtt_data))

    chunks = []
    for user, text, start_ts, end_ts in turns:
        text_chunks = chunk_text_with_timestamps(text, start_ts, end_ts, chunk_size, overlap)
        for chunk, chunk_start_ts, chunk_end_ts in text_chunks:
            chunks.append((len(chunks) + 1, chunk, chunk_start_ts, chunk_end_ts, user))
    return chunks


def save_chunks_to_db(db_path, folder_name, file_name, chunks):
    """Insert *chunks* into the ``text_chunks`` table, creating it if needed.

    Args:
        db_path: Path of the SQLite database file.
        folder_name: Stored in the ``talkname`` column of every row.
        file_name: Stored in the ``filename`` column of every row.
        chunks: Iterable of ``(chunk_id, chunk, start_ts, end_ts, user)``
            tuples as returned by ``process_vtt_file``.
    """
    rows = [
        (folder_name, file_name, chunk_id, chunk, chunk_start_ts, chunk_end_ts, user)
        for chunk_id, chunk, chunk_start_ts, chunk_end_ts, user in chunks
    ]
    # closing() releases the connection on every path; nothing is committed
    # if any statement raises.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS text_chunks (
                id INTEGER PRIMARY KEY,
                talkname TEXT,
                filename TEXT,
                chunkid INTEGER,
                chunk TEXT,
                start_ts TEXT,
                end_ts TEXT,
                username TEXT
            )
        ''')
        conn.executemany('''
            INSERT INTO text_chunks (talkname, filename, chunkid, chunk, start_ts, end_ts, username)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        conn.commit()


# Script configuration.
root_dir = '/Users/t0mkaka/Desktop/Network/vtt_files'  # Replace with the path to your root folder
db_path = 'rag.db'  # Replace with the path to your SQLite database


def main():
    """Process every VTT file listed in the database and save its chunks."""
    for folder_name, file_name in read_vtt_files_from_db(db_path):
        # NOTE(review): the path is built from root_dir and file_name only;
        # folder_name is stored as the talk name but is not part of the path.
        # Confirm the files are not nested under per-talk folders.
        file_path = os.path.join(root_dir, file_name)
        if not os.path.exists(file_path):
            # Report skipped files instead of dropping them silently.
            print(f"Skipping missing file: {file_path}", file=sys.stderr)
            continue
        chunks = process_vtt_file(file_path)
        save_chunks_to_db(db_path, folder_name, file_name, chunks)

    print("Processed and saved all text chunks with timestamps and usernames to the database.")


if __name__ == "__main__":
    main()