import os
import sqlite3
from contextlib import closing
from datetime import datetime
from typing import Iterable, Optional

from huggingface_hub import HfApi

# SQLite database file, resolved relative to the current working directory.
# The same name is used as the upload source and as the path inside the Space.
DB_PATH = 'dataset.db'

# Hugging Face Space that receives the uploaded database file.
# Replace with your Space's repository name.
HF_REPO_ID = "Danielrahmai1991/dataset_interface"


def initialize_database() -> None:
    """
    Create the SQLite database file and the 'documents' table if missing.

    The table has the following columns:

    - `id`: auto-incrementing integer primary key.
    - `text`: the text content of the document (required, NOT NULL).
    - `topics`: a string of associated topics (optional).
    - `date`: timestamp that defaults to CURRENT_TIMESTAMP on insert.

    Because the statement is `CREATE TABLE IF NOT EXISTS`, calling this
    function repeatedly is harmless: an existing table is left untouched.

    Raises:
    -------
    sqlite3.Error
        If the database cannot be opened or the statement fails. The
        connection is closed in that case as well.

    Example:
    --------
    >>> initialize_database()  # doctest: +SKIP
    # Creates 'dataset.db' (if needed) with the 'documents' table schema.
    """
    # `closing` guarantees conn.close() even when execute/commit raises;
    # the sqlite3 connection's own context manager only handles the
    # transaction (commit on success, rollback on error), not closing.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    text TEXT NOT NULL,
                    topics TEXT,
                    date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')


def commit_to_huggingface() -> None:
    """
    Upload the database file to the Hugging Face Space repository.

    The access token is read from the `hf_key` environment variable and
    passed to `HfApi`. If the variable is unset, `None` is passed instead.
    NOTE(review): how `HfApi` authenticates with a `None` token is decided
    by huggingface_hub, not by this module - confirm for your deployment.

    The file at `DB_PATH` is uploaded to the same path inside the Space
    identified by `HF_REPO_ID`. Errors raised by `upload_file` propagate
    to the caller.
    """
    api = HfApi(token=os.getenv("hf_key"))

    # Upload and commit the database file.
    api.upload_file(
        path_or_fileobj=DB_PATH,
        path_in_repo=DB_PATH,
        repo_id=HF_REPO_ID,
        repo_type="space",
    )


def save_to_db(chunks: Iterable[str], topics: Optional[str] = None) -> None:
    """
    Save chunks of text to the SQLite database and upload the database.

    Steps:

    1. Ensure the database and 'documents' table exist by calling
       `initialize_database`.
    2. Insert one row per chunk into 'documents'. The `text` column gets
       the chunk, the `topics` column gets `topics` (same value for every
       row), and `date` is filled in by the column default.
    3. Commit all rows in a single transaction; if any insert fails the
       whole batch is rolled back and nothing is uploaded.
    4. Call `commit_to_huggingface` to upload the database file.

    Parameters:
    ----------
    chunks : iterable of str
        Text chunks to save. Pass a list (or other iterable) of strings;
        a bare string would be iterated character by character.
    topics : str or None, optional
        Topics associated with the chunks. Defaults to None.

    Raises:
    -------
    sqlite3.Error
        If an insert fails (for example a `None` chunk violates the
        NOT NULL constraint on `text`). The connection is still closed.

    Example:
    --------
    >>> save_to_db(["This is the first chunk.", "This is the second chunk."],
    ...            "Example Topics")  # doctest: +SKIP
    # Saves two rows to the 'documents' table with the given text and topics.
    """
    # Ensure the database and table are initialized.
    initialize_database()

    rows = ((chunk, topics) for chunk in chunks)

    # `closing` releases the connection even if an insert raises; the inner
    # `with conn` commits on success and rolls back on failure.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        with conn:
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                rows,
            )

    # Only reached when the rows were committed successfully.
    commit_to_huggingface()