Spaces:

flax-community
/

dalle-mini

Running

App Files Files Community

khalidsaifullaah committed on Jul 8, 2021

Commit

a8e4fc0

1 Parent(s): 8b9d1f5

CC3M downloader script updated

Browse files

Files changed (1) hide show

data/CC3M_downloader.py +46 -140

data/CC3M_downloader.py CHANGED Viewed

@@ -1,156 +1,62 @@
-# It expects you to have the train and validation `.tsv` file downloaded in the current directory
-# Head around to this link to download the `.tsv` files
-# https://ai.google.com/research/ConceptualCaptions/download
 '''
-This script was adapted from https://github.com/igorbrigadir/DownloadConceptualCaptions
-Few changes were made post that (excluding the post processing of data). We'll have
-only csv file with image url and captions written in different languages but not images
-as we do not own any of the images in the dataset and hence cannot legally provide them to you.
 '''
 import pandas as pd
-import numpy as np
 import requests
-import zlib
-import os
-import shelve
-import magic
 from multiprocessing import Pool
 from tqdm import tqdm
-headers = {
-    'User-Agent':'Googlebot-Image/1.0', # Pretend to be googlebot
-    'X-Forwarded-For': '64.18.15.200'
-}
-def _df_split_apply(tup_arg):
-    split_ind, subset, func = tup_arg
-    r = subset.apply(func, axis=1)
-    return (split_ind, r)
-def df_multiprocess(df, processes, chunk_size, func, dataset_name):
-    print("Generating parts...")
-    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:
-        pbar = tqdm(total=len(df), position=0)
-        # Resume:
-        finished_chunks = set([int(k) for k in results.keys()])
-        pbar.desc = "Resuming"
-        for k in results.keys():
-            pbar.update(len(results[str(k)][1]))
-        pool_data = ((index, df[i:i + chunk_size], func) for index, i in enumerate(range(0, len(df), chunk_size)) if index not in finished_chunks)
-        print(int(len(df) / chunk_size), "parts.", chunk_size, "per part.", "Using", processes, "processes")
-        pbar.desc = "Downloading"
-        with Pool(processes) as pool:
-            for i, result in enumerate(pool.imap_unordered(_df_split_apply, pool_data, 2)):
-                results[str(result[0])] = result
-                pbar.update(len(result[1]))
-        pbar.close()
-    print("Finished Downloading.")
-    return
-# Unique name based on url
-def _file_name(row):
-    return "%s/%s_%s" % (row['folder'], row.name, (zlib.crc32(row['url'].encode('utf-8')) & 0xffffffff))
-# For checking mimetypes separately without download
-def check_mimetype(row):
-    if os.path.isfile(str(row['file'])):
-        row['mimetype'] = magic.from_file(row['file'], mime=True)
-        row['size'] = os.stat(row['file']).st_size
-    return row
-# Don't download image, just check with a HEAD request, can't resume.
-# Can use this instead of download_image to get HTTP status codes.
-def check_download(row):
-    fname = _file_name(row)
     try:
-        # not all sites will support HEAD
-        response = requests.head(row['url'], stream=False, timeout=5, allow_redirects=True, headers=headers)
-        row['status'] = response.status_code
-        row['headers'] = dict(response.headers)
-    except:
-        # log errors later, set error as 408 timeout
-        row['status'] = 408
-        return row
-    if response.ok:
-        row['file'] = fname
-    return row
-def download_image(row):
-    fname = _file_name(row)
-    # Skip Already downloaded, retry others later
-    if os.path.isfile(fname):
-        row['status'] = 200
-        row['file'] = fname
-        row['mimetype'] = magic.from_file(row['file'], mime=True)
-        row['size'] = os.stat(row['file']).st_size
-        return row
-    try:
-        # use smaller timeout to skip errors, but can result in failed downloads
-        response = requests.get(row['url'], stream=False, timeout=10, allow_redirects=True, headers=headers)
-        row['status'] = response.status_code
-        #row['headers'] = dict(response.headers)
     except Exception as e:
-        # log errors later, set error as 408 timeout
-        row['status'] = 408
-        return row
-    if response.ok:
-        try:
-            with open(fname, 'wb') as out_file:
-                # some sites respond with gzip transport encoding
-                response.raw.decode_content = True
-                out_file.write(response.content)
-            row['mimetype'] = magic.from_file(fname, mime=True)
-            row['size'] = os.stat(fname).st_size
-        except:
-            # This is if it times out during a download or decode
-            row['status'] = 408
-            return row
-        row['file'] = fname
-    return row
-def open_tsv(fname, folder):
-    print("Opening %s Data File..." % fname)
-    df = pd.read_csv(fname, sep='\t', names=["caption","url"], usecols=range(1,2))
-    df['folder'] = folder
-    print("Processing", len(df), " Images:")
-    return df
-def df_from_shelve(chunk_size, func, dataset_name):
-    print("Generating Dataframe from results...")
-    with shelve.open('%s_%s_%s_results.tmp' % (dataset_name, func.__name__, chunk_size)) as results:
-        keylist = sorted([int(k) for k in results.keys()])
-        df = pd.concat([results[str(k)][1] for k in keylist], sort=True)
-    return df
-# number of processes in the pool can be larger than cores
-num_processes = 256
-# chunk_size is how many images per chunk per process - changing this resets progress when restarting.
-images_per_part = 200
-'''
-A bunch of them will fail to download, and return web pages instead. These will
-need to be cleaned up later. See downloaded_validation_report.tsv after it downloads
-for HTTP errors. Around 10-11% of images are gone, based on validation set results. Setting
-the user agent could fix some errors too maybe - not sure if any requests are rejected by
-sites based on this.
-'''
-data_name = "validation"
-df = open_tsv("Validation_GCC-1.1.0-Validation.tsv", data_name)
-df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
-df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
-df.to_csv("downloaded_%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
-print("Saved.")
-data_name = "training"
-df = open_tsv("Train-GCC-training.tsv",data_name)
-df_multiprocess(df=df, processes=num_processes, chunk_size=images_per_part, func=download_image, dataset_name=data_name)
-df = df_from_shelve(chunk_size=images_per_part, func=download_image, dataset_name=data_name)
-df.to_csv("downloaded_%s_report.tsv.gz" % data_name, compression='gzip', sep='\t', header=False, index=False)
-print("Saved.")

 '''
+This script was adapted from Luke Melas-Kyriazi's code (https://twitter.com/lukemelas).
+A few changes were made for this particular dataset. You need to have the `.tsv` file downloaded in your working directory.
+Find the files here: https://github.com/google-research-datasets/conceptual-captions
 '''
+import sys
+import os
+from datetime import datetime
 import pandas as pd
+import contexttimer
+from urllib.request import urlopen
 import requests
+from PIL import Image
+import torch
+from torchvision.transforms import functional as TF
 from multiprocessing import Pool
 from tqdm import tqdm
+import logging
+import sys
+# Setup
+logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
+requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
+if len(sys.argv) != 3:
+    print("Provide .tsv file name & output directory. e.g. python downloader.py Train-GCC-training.tsv training")
+    exit(1)
+# Load data
+print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')
+with contexttimer.Timer(prefix="Loading from tsv"):
+    df = pd.read_csv(sys.argv[1], delimiter='\t', header=None)
+url_to_idx_map = {url: index for index, caption, url in df.itertuples()}
+print(f'Loaded {len(url_to_idx_map)} urls')
+base_dir = os.path.join(os.getcwd(), sys.argv[2])
+def process(item):
+    url, image_id = item
     try:
+        base_url = os.path.basename(url)  # extract base url
+        stem, ext = os.path.splitext(base_url)  # split into stem and extension
+        filename = f'{image_id:08d}---{stem}.jpg'  # create filename
+        filepath = os.path.join(base_dir, filename)  # concat to get filepath
+        if not os.path.isfile(filepath):
+            req = requests.get(url, stream=True, timeout=1, verify=False).raw
+            image = Image.open(req).convert('RGB')
+            if min(image.size) > 512:
+                image = TF.resize(image, size=512, interpolation=Image.LANCZOS)
+            image.save(filepath)  # save PIL image
     except Exception as e:
+        logging.info(" ".join(repr(e).splitlines()))
+        logging.error(url)
+list_of_items = list(url_to_idx_map.items())
+print(len(list_of_items))
+with Pool(128) as p:
+    r = list(tqdm(p.imap(process, list_of_items), total=len(list_of_items)))
+    print('DONE')