davanstrien HF staff committed on
Commit
7f66f08
·
1 Parent(s): d102592

Refactor code to add createdAt field and filter

Browse files
Files changed (1) hide show
  1. app.py +19 -12
app.py CHANGED
@@ -1,16 +1,17 @@
1
  import os
2
  from datetime import datetime, timedelta
 
3
  from typing import Any, Dict
4
 
5
  import gradio as gr
6
  import pandas as pd
 
7
  from diskcache import Cache
8
  from dotenv import load_dotenv
9
  from httpx import Client
10
  from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
11
  from tqdm.auto import tqdm
12
  from tqdm.contrib.concurrent import thread_map
13
- from cachetools import TTLCache, cached
14
 
15
  load_dotenv()
16
 
@@ -42,19 +43,24 @@ client = Client(
42
  cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
43
 
44
 
 
 
 
 
 
 
 
 
 
 
45
  def add_created_data(dataset):
46
  _id = dataset._id
47
- created = datetime.fromtimestamp(int(_id[:8], 16))
48
  dataset_dict = dataset.__dict__
49
- dataset_dict["created"] = created
50
  return dataset_dict
51
 
52
 
53
- def get_three_months_ago():
54
- now = datetime.now()
55
- return now - timedelta(days=90)
56
-
57
-
58
  def get_readme_len(dataset: Dict[str, Any]):
59
  try:
60
  url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
@@ -110,7 +116,8 @@ def get_datasets():
110
  def load_data():
111
  datasets = get_datasets()
112
  datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
113
- filtered = [ds for ds in datasets if ds["created"] > get_three_months_ago()]
 
114
  ds_with_len = thread_map(get_readme_len, filtered)
115
  ds_with_len = [ds for ds in ds_with_len if ds is not None]
116
  ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
@@ -122,7 +129,7 @@ columns_to_drop = [
122
  "cardData",
123
  "gated",
124
  "sha",
125
- "paperswithcode_id",
126
  "tags",
127
  "description",
128
  "siblings",
@@ -150,11 +157,11 @@ def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to
150
 
151
 
152
  def filter_df_by_max_age(df, max_age_days=None):
153
- df = df.dropna(subset=["created"])
154
  now = datetime.now()
155
  if max_age_days is not None:
156
  max_date = now - timedelta(days=max_age_days)
157
- df = df[df["created"] >= max_date]
158
  return df
159
 
160
 
 
1
  import os
2
  from datetime import datetime, timedelta
3
+ from sys import platform
4
  from typing import Any, Dict
5
 
6
  import gradio as gr
7
  import pandas as pd
8
+ from cachetools import TTLCache, cached
9
  from diskcache import Cache
10
  from dotenv import load_dotenv
11
  from httpx import Client
12
  from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
13
  from tqdm.auto import tqdm
14
  from tqdm.contrib.concurrent import thread_map
 
15
 
16
  load_dotenv()
17
 
 
43
  cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
44
 
45
 
46
+ def get_three_months_ago():
47
+ now = datetime.now()
48
+ return now - timedelta(days=90)
49
+
50
+
51
+ def parse_date(date_str):
52
+ # parse the created date from string 2023-11-17T16:39:54.000Z to datetime
53
+ return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
54
+
55
+
56
  def add_created_data(dataset):
57
  _id = dataset._id
58
+ created = parse_date(dataset.createdAt)
59
  dataset_dict = dataset.__dict__
60
+ dataset_dict["createdAt"] = created
61
  return dataset_dict
62
 
63
 
 
 
 
 
 
64
  def get_readme_len(dataset: Dict[str, Any]):
65
  try:
66
  url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
 
116
  def load_data():
117
  datasets = get_datasets()
118
  datasets = [add_created_data(dataset) for dataset in tqdm(datasets)]
119
+ # datasets = [dataset.__dict__ for dataset in tqdm(datasets)]
120
+ filtered = [ds for ds in datasets if ds["createdAt"] > get_three_months_ago()]
121
  ds_with_len = thread_map(get_readme_len, filtered)
122
  ds_with_len = [ds for ds in ds_with_len if ds is not None]
123
  ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
 
129
  "cardData",
130
  "gated",
131
  "sha",
132
+ # "paperswithcode_id",
133
  "tags",
134
  "description",
135
  "siblings",
 
157
 
158
 
159
  def filter_df_by_max_age(df, max_age_days=None):
160
+ df = df.dropna(subset=["createdAt"])
161
  now = datetime.now()
162
  if max_age_days is not None:
163
  max_date = now - timedelta(days=max_age_days)
164
+ df = df[df["createdAt"] >= max_date]
165
  return df
166
 
167