File size: 3,676 Bytes
06a53dc
d90bac0
2aef697
06a53dc
 
2aef697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06a53dc
 
 
 
 
 
 
 
 
2aef697
 
06a53dc
 
 
 
 
 
 
b97fe69
 
 
 
d90bac0
 
b97fe69
 
e0683ca
b97fe69
 
 
 
 
 
 
 
 
 
e0683ca
06a53dc
2aef697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06a53dc
 
 
 
 
 
 
 
 
 
 
 
 
b97fe69
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import sqlite3
import os
from datetime import datetime

def initialize_database():
    """
    Create the 'documents' table in 'dataset.db' if it does not exist yet.

    The database file is opened relative to the current working directory
    and is created by SQLite when missing. The table has these columns:

    - `id`: auto-incrementing integer primary key.
    - `text`: the document text (required, NOT NULL).
    - `topics`: optional string with the associated topics.
    - `date`: insertion timestamp; defaults to SQLite's CURRENT_TIMESTAMP
      (which SQLite reports in UTC).

    Calling the function repeatedly is safe: `CREATE TABLE IF NOT EXISTS`
    leaves an existing table untouched.

    Raises:
    ------
    sqlite3.Error
        If the database cannot be opened or the statement fails. The
        connection is closed in that case as well.

    Example:
    --------
    >>> initialize_database()
    # Creates 'dataset.db' (if needed) containing the 'documents' table.
    """
    conn = sqlite3.connect('dataset.db')
    try:
        # Create the 'documents' table if it doesn't exist
        conn.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                text TEXT NOT NULL,
                topics TEXT,
                date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Always release the file handle, even when the DDL statement fails.
        conn.close()

from huggingface_hub import HfApi

def commit_to_huggingface():
    """
    Upload the local 'dataset.db' file to the Hugging Face Space repository.

    The access token is read from the `hf_key` environment variable and
    passed to `HfApi`. The file is then uploaded under the same name,
    'dataset.db', to the Space given by the repository id below.
    """
    db_file = "dataset.db"

    # Replace with your Space's repository name
    space_repo = "Danielrahmai1991/dataset_interface"

    client = HfApi(token=os.getenv("hf_key"))

    # Upload and commit the dataset.db file
    client.upload_file(
        path_or_fileobj=db_file,
        path_in_repo=db_file,
        repo_id=space_repo,
        repo_type="space",
    )



def save_to_db(chunks, topics=None):
    """
    Save chunks of text to the SQLite database and upload the database file.

    This function performs the following steps:
    1. Ensures the database and 'documents' table exist by calling
       `initialize_database`.
    2. Inserts one row per chunk into the 'documents' table:
       - `text` receives the chunk.
       - `topics` receives the same `topics` value for every chunk.
       - `date` is not supplied here, so the column default applies.
    3. Commits all inserts as a single transaction; if any insert fails,
       the whole batch is rolled back and the exception propagates.
    4. Closes the connection (also on failure).
    5. Calls `commit_to_huggingface` to upload 'dataset.db'. This only
       happens after a successful commit.

    Parameters:
    ----------
    chunks : iterable of str
        The text chunks to be saved to the database.
    topics : str or None, optional
        A string representing the topics associated with the chunks.
        Defaults to None.

    Raises:
    ------
    sqlite3.Error
        If an insert fails (for example, a chunk is None and violates the
        NOT NULL constraint on `text`). Nothing is stored or uploaded then.

    Example:
    --------
    >>> save_to_db(["This is the first chunk.", "This is the second chunk."], "Example Topics")
    # Saves two rows to the 'documents' table with the provided text and topics.
    """
    # Ensure the database and table are initialized
    initialize_database()

    conn = sqlite3.connect('dataset.db')
    try:
        # `with conn` commits on success and rolls back if an insert raises,
        # so a failing chunk never leaves a half-written batch behind.
        with conn:
            conn.executemany(
                'INSERT INTO documents (text, topics) VALUES (?, ?)',
                ((chunk, topics) for chunk in chunks),
            )
    finally:
        # The connection context manager does not close the connection.
        conn.close()

    commit_to_huggingface()