c17hawke committed on
Commit
576f178
·
1 Parent(s): dd61940

stage 01 completed

Browse files
Files changed (1) hide show
  1. src/stage_01.py +46 -0
src/stage_01.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import shutil
4
+ from tqdm import tqdm
5
+ import logging
6
+ from src.utils.common import read_yaml, create_directories
7
+ from datasets import load_dataset
8
+
9
STAGE = "stage 01"  ## <<< change stage name

# The file handler configured below writes into "logs/"; create the directory
# first so a fresh checkout does not fail with FileNotFoundError when the log
# file is opened.
os.makedirs("logs", exist_ok=True)

# Append every record at INFO level or above to a single shared log file.
logging.basicConfig(
    filename=os.path.join("logs", 'running_logs.log'),
    level=logging.INFO,
    format="[%(asctime)s: %(levelname)s: %(module)s]: %(message)s",
    filemode="a"
)
17
+
18
+
19
def main(config_path, params_path):
    """Load the dataset named in the params file into the configured cache.

    Args:
        config_path: Path to the YAML config file; it must provide
            ``artifacts.cache_dir``.
        params_path: Path to the YAML params file; it must provide
            ``train.dataset_name``.

    Returns:
        None. The loaded dataset object is not kept; the call is made for
        its effect on ``cache_dir``.
    """
    ## read config files
    config = read_yaml(config_path)
    params = read_yaml(params_path)
    artifacts = config["artifacts"]

    dataset = params["train"]["dataset_name"]
    cache_dir_ = artifacts["cache_dir"]
    # presumably creates the listed directories if missing -- helper is
    # defined in src.utils.common; confirm there
    create_directories([cache_dir_])

    logging.info("load dataset")
    # The return value is intentionally discarded: only the populated cache
    # directory is used afterwards.
    load_dataset(dataset, cache_dir=cache_dir_)
    logging.info(f"dataset saved in : {cache_dir_}")
32
+
33
if __name__ == '__main__':
    # Command-line entry point: both file paths are optional and fall back
    # to the defaults given here.
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", "-c", default="configs/config.yaml")
    parser.add_argument("--params", "-p", default="params.yaml")
    parsed_args = parser.parse_args()

    try:
        logging.info("\n********************")
        # STAGE already contains the word "stage", so it is not repeated in
        # the message (previously logged as "stage stage 01").
        logging.info(f">>>>> {STAGE} started <<<<<")
        main(config_path=parsed_args.config, params_path=parsed_args.params)
        logging.info(f">>>>> {STAGE} completed!<<<<<\n")
    except Exception as e:
        # Record the full traceback in the log file, then re-raise the active
        # exception unchanged so the process still exits with an error.
        logging.exception(e)
        raise