|
import sqlite3 |
|
import os |
|
from datetime import datetime |
|
|
|
def initialize_database(db_path: str = "dataset.db") -> None:
    """
    Initialize the SQLite database and create the 'documents' table if it doesn't exist.

    This function performs the following steps:
    1. Connects to the SQLite database at `db_path` (the file is created if it doesn't exist).
    2. Creates the 'documents' table, unless it is already present, with the columns:
        - `id`: An auto-incrementing integer primary key.
        - `text`: The main text content of the document (required, non-nullable).
        - `topics`: A string representing associated topics (optional).
        - `date`: A timestamp that defaults to CURRENT_TIMESTAMP when a row is inserted.
    3. Commits the changes and closes the connection.

    The connection is closed even when creating the table or committing fails,
    so a failed call does not leave an open handle on the database file.
    Calling the function repeatedly is safe because of `IF NOT EXISTS`.

    Parameters:
    ----------
    db_path : str, optional
        Path of the SQLite database file. Defaults to 'dataset.db'.

    Raises:
    ------
    sqlite3.Error
        Propagated unchanged if connecting, creating the table, or committing fails.

    Example:
    --------
    >>> initialize_database()
    # Creates or updates the 'dataset.db' file with the 'documents' table schema.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                text TEXT NOT NULL,
                topics TEXT,
                date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # sqlite3's own context manager only commits/rolls back; it never
        # closes, so the close has to be explicit to avoid leaking the handle.
        conn.close()
|
|
|
from huggingface_hub import HfApi |
|
|
|
def commit_to_huggingface():
    """
    Upload the local 'dataset.db' file to the Hugging Face Space repository.

    The access token is read from the `hf_key` environment variable and handed
    to `HfApi`; the file is then uploaded under the same name to the Space
    'Danielrahmai1991/dataset_interface'.
    """
    hub_client = HfApi(token=os.getenv("hf_key"))

    # The local file name and its path inside the repository are the same.
    upload_args = {
        "path_or_fileobj": "dataset.db",
        "path_in_repo": "dataset.db",
        "repo_id": "Danielrahmai1991/dataset_interface",
        "repo_type": "space",
    }
    hub_client.upload_file(**upload_args)
|
|
|
|
|
|
|
def save_to_db(chunks, topics=None):
    """
    Save chunks of text to the SQLite database.

    This function performs the following steps:
    1. Ensures the database and 'documents' table exist by calling `initialize_database`.
    2. Connects to the 'dataset.db' SQLite database.
    3. Inserts one row per chunk into the 'documents' table:
        - The `text` column stores the chunk of text.
        - The `topics` column stores the same `topics` value for every chunk (optional).
        - The `date` column is not set here; the table's default fills it in.
    4. Commits the changes and closes the connection.
    5. Calls `commit_to_huggingface` to upload the database file.

    The connection is closed even if an insert or the commit fails. In that
    case the exception propagates and the upload step is not reached.

    Parameters:
    ----------
    chunks : list of str
        A list of text chunks to be saved to the database.
    topics : str or None, optional
        A string representing the topics associated with the chunks. Defaults to None.

    Raises:
    ------
    sqlite3.Error
        Propagated unchanged if an insert or the commit fails.

    Example:
    --------
    >>> save_to_db(["This is the first chunk.", "This is the second chunk."], "Example Topics")
    # Saves two rows to the 'documents' table with the provided text and topics.
    """
    initialize_database()

    conn = sqlite3.connect('dataset.db')
    try:
        # One executemany call replaces the per-chunk execute loop; every
        # row shares the same `topics` value.
        conn.executemany(
            'INSERT INTO documents (text, topics) VALUES (?, ?)',
            ((chunk, topics) for chunk in chunks),
        )
        conn.commit()
    finally:
        # Closing without a commit discards the pending inserts, so a failed
        # batch leaves no partial rows behind and no handle leaks.
        conn.close()

    # Upload only after the rows have been committed and the file handle released.
    commit_to_huggingface()
|
|
|
|
|
|