File size: 2,105 Bytes
073bb25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import boto3
from pathlib import Path
from botocore.exceptions import ClientError
from tqdm import tqdm
import sys

# Storage connection settings, read from the process environment.
# Any variable that is not set is read as None and handed to boto3 as-is.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
ENDPOINT_URL = os.environ.get("SUPABASE_STORAGE_ENDPOINT")
BUCKET_NAME = os.environ.get("SUPABASE_BUCKET")
REGION = os.environ.get("AWS_REGION")

# Module-level S3 client pointed at the configured endpoint; created once at
# import time and shared by the rest of this script.
s3 = boto3.client(
    service_name="s3",
    region_name=REGION,
    endpoint_url=ENDPOINT_URL,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

def _list_existing_pdf_keys() -> set:
    """Return the keys of all ``.pdf`` objects currently in the bucket.

    Uses the ``list_objects_v2`` paginator so every page of results is read;
    a single ``list_objects_v2`` call only returns the first page of keys.
    """
    paginator = s3.get_paginator("list_objects_v2")
    existing = set()
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(".pdf"):
                existing.add(obj["Key"])
    return existing


def upload_pdfs(folder_path: str):
    """Upload the PDFs found directly inside *folder_path* to the bucket.

    Only files matching ``*.pdf`` at the top level of the folder are
    considered (no recursion). Each file is stored under its bare file name;
    files whose name is already present as a key in the bucket are skipped.
    Progress and errors are reported on stdout.

    Args:
        folder_path: Path to a local directory containing PDF files.

    Returns:
        None. An invalid folder or a folder without PDFs is reported and the
        function returns without contacting the bucket.

    Raises:
        botocore.exceptions.ClientError: If listing the bucket fails.
            Failures of individual uploads are caught, reported, and do not
            stop the remaining uploads.
    """
    folder = Path(folder_path)

    # Validate local input first so a bad path never costs a network call.
    if not folder.is_dir():
        print("❌ Invalid folder path.")
        return

    pdf_files = sorted(folder.glob("*.pdf"))
    if not pdf_files:
        print("Folder exists, but no PDF files were found.")
        return

    existing_keys = _list_existing_pdf_keys()

    failed = []
    for file_path in tqdm(pdf_files):
        key = file_path.name

        if key in existing_keys:
            print(f"✅ {key} already exists in the bucket, skipping.")
            continue

        print(f"📄 Uploading: {key}")
        try:
            s3.upload_file(
                Filename=str(file_path),
                Bucket=BUCKET_NAME,
                Key=key,
                ExtraArgs={"ContentType": "application/pdf"},
            )
        # upload_file reports transfer failures as S3UploadFailedError rather
        # than ClientError, so both must be caught to keep the batch going.
        except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
            print(f"❌ Error uploading {key}: {e}")
            failed.append(key)

    if failed:
        print(f"❌ Upload finished with {len(failed)} failed file(s): {', '.join(failed)}")
        return

    print("✅ Upload complete. Run `make sync-bucket` to process the files.")



def main():
    """Command-line entry point: pick the PDF folder and upload its contents.

    The folder is taken from the first command-line argument when one is
    given. Otherwise the user is prompted for it, and an empty answer falls
    back to the default folder instead of silently resolving to the current
    working directory.
    """
    default_folder = "backend/data/rag_articles"

    if len(sys.argv) > 1:
        folder = sys.argv[1]
    else:
        print(f"No folder path provided. Press Enter to use the default: {default_folder}")
        answer = input(f"📂 Enter path to folder with PDFs (normally {default_folder}): ").strip()
        # An empty string would become Path(""), i.e. the current directory.
        folder = answer or default_folder

    print(f"Using folder: {folder}")
    upload_pdfs(folder)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()