File size: 1,668 Bytes
b044f34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from itertools import islice
from typing import List

from datasets import load_dataset
from fastapi import FastAPI, Query

# ASGI application object; the route handlers in this module register on it.
app = FastAPI()

# Load the dataset in streaming mode for memory efficiency.
# This runs once at import time. The handlers index the result by split name
# (dataset["train"]) and call .keys() on it, so it is used as a mapping of
# split name -> iterable of examples.
# NOTE(review): presumably each iteration over a split restarts from the first
# example, so every request re-reads the stream from the start -- confirm.
dataset = load_dataset("togethercomputer/RedPajama-Data-1T", streaming=True)

@app.get("/")
def greet_json():
    """Root endpoint: reply with a fixed welcome message as JSON."""
    welcome = {"message": "Welcome to the RedPajama Dataset API"}
    return welcome

@app.get("/get_data/")
def get_data(chunk_size: int = 10):
    """
    Returns the first ``chunk_size`` examples of the "train" split.

    Parameters:
    - chunk_size: The number of examples to return (default: 10). Zero or a
      negative value yields an empty list.

    Returns:
    - A dict ``{"data": [...]}`` holding at most ``chunk_size`` examples, in
      the order the split yields them. Fewer are returned if the split is
      exhausted first.
    """
    # The previous loop only stopped when i + 1 == chunk_size, which is never
    # true for chunk_size <= 0, so such a request tried to pull the entire
    # streamed split into memory. islice rejects negative stops, hence the clamp.
    limit = max(chunk_size, 0)
    return {"data": list(islice(dataset["train"], limit))}  # Adjust split if needed

@app.get("/search_data/")
def search_data(keyword: str, max_results: int = 10):
    """
    Searches the "train" split for examples mentioning a keyword.

    Matching is a case-insensitive substring test against ``str(example)``,
    i.e. the textual form of the whole example, so field names and non-text
    values can match as well as the text itself.

    Parameters:
    - keyword: The keyword to search for.
    - max_results: The maximum number of results to return (default: 10).
      Zero or a negative value yields an empty list without scanning.

    Returns:
    - A dict ``{"results": [...]}`` with up to ``max_results`` matching
      examples, in the order the split yields them.
    """
    results = []
    # The previous stop test (len(results) == max_results) could never fire
    # for max_results <= 0, so the scan ran over the whole split and kept
    # every match. Return early instead.
    if max_results <= 0:
        return {"results": results}

    needle = keyword.lower()  # hoisted: loop-invariant
    # NOTE: if fewer than max_results examples match, this still walks the
    # entire split before returning.
    for example in dataset["train"]:  # Adjust split if needed
        if needle in str(example).lower():
            results.append(example)
            # The count only changes on a match, so only check it here.
            if len(results) >= max_results:
                break
    return {"results": results}

@app.get("/data_summary/")
def data_summary():
    """
    Provides a basic summary of the dataset.

    Returns:
    - A dict ``{"dataset_splits": [...]}`` listing the names of the splits
      available in the loaded dataset.
    """
    # .keys() returns a view object, not a list. Materialise it so the
    # response body is a plain JSON array rather than an object the encoder
    # has no sequence rule for.
    return {"dataset_splits": list(dataset.keys())}