
helpers

Dataset-related tasks

Downloader

A rudimentary URL downloader (like wget or curl) to demonstrate Rich progress bars.

Adapted from here
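
A minimal usage sketch, assuming Downloader is imported from aicrowd.dataset.helpers; the URLs and destination directory are placeholders:

import os

from aicrowd.dataset.helpers import Downloader

downloader = Downloader(njobs=2)
os.makedirs("data", exist_ok=True)  # the destination directory must already exist
downloader.download(
    urls=["https://example.com/train.csv", "https://example.com/test.csv"],  # placeholder URLs
    dest_dir="data",
)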

__init__(self, njobs=None, api_key=None, chunk_size=1048576) special

Parameters:

    njobs (int): number of parallel downloads. Default: None.
    api_key (str): AIcrowd API Key. Default: None.
    chunk_size (int): no. of bytes to process in each iteration. Default: 1048576.
Source code in aicrowd/dataset/helpers.py
def __init__(
    self, njobs: int = None, api_key: str = None, chunk_size: int = 1024 * 1024
):
    """
    Args:
        njobs: number of parallel downloads
        api_key: AIcrowd API Key
        chunk_size: no. of bytes to process in each iteration
    """
    self.progress_bar = TqdmProgressBar()

    if njobs is None:
        njobs = max(os.cpu_count() // 2, 1)

    self.njobs = njobs
    self.api_key = api_key
    # need this to indicate when the thread should die
    self.__active = False
    self.chunk_size = self.get_chunk_size(chunk_size)

    self.log = logging.getLogger()
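
When njobs is not given, it defaults to half the CPU count, with a floor of one worker:

import os

max(os.cpu_count() // 2, 1)  # e.g. 4 on an 8-core machine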

copy_url(self, filename, url, path)

Copy data from a URL to a local file

Parameters:

    filename (str): name of the file, used as the progress bar label. Required.
    url (str): url for the file to be downloaded. Required.
    path (str): path where the file will be saved. Required.
Source code in aicrowd/dataset/helpers.py
def copy_url(self, filename: str, url: str, path: str) -> None:
    """
    Copy data from a URL to a local file

    Args:
        filename: name of the file, used as the progress bar label
        url: url for the file to be downloaded
        path: path where the file will be saved
    """
    req = Request(url)
    if urlparse(url).netloc == DATASETS_HOST:
        req.add_unredirected_header("Authorization", f"Token {self.api_key}")

    try:
        response = urlopen(req)
    except HTTPError as e:
        self.log.error("Error in downloading dataset %s.\n%s", url, e)
        click.echo(click.style(f"Error in downloading dataset {url}", fg="red"))
        return

    # This will break if the response doesn't contain content length
    task_id = self.progress_bar.add(
        filename=filename,
        total=int(response.info().get("Content-length")),
    )

    with open(path, "wb") as dest_file:
        for data in iter(partial(response.read, self.chunk_size), b""):
            # parent has quit, die
            if not self.__active:
                return

            dest_file.write(data)
            self.progress_bar.update(progress_bar_id=task_id, step=len(data))

    self.log.info("File %s downloaded successfully", path)
    self.progress_bar.close(progress_bar_id=task_id)
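
The read loop above uses iter() with a b"" sentinel to stream the response to disk in fixed-size chunks; the same pattern in isolation (the URL is a placeholder):

from functools import partial
from urllib.request import urlopen

response = urlopen("https://example.com/file.bin")  # placeholder URL
with open("file.bin", "wb") as dest_file:
    # response.read() returns b"" at EOF, which ends the iteration
    for data in iter(partial(response.read, 1024 * 1024), b""):
        dest_file.write(data)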

download(self, urls, dest_dir)

Download multiple files to the given directory

Parameters:

    urls (Iterable[str]): list of urls from which files are to be downloaded. Required.
    dest_dir (str): downloaded files will end up here. Required.
Source code in aicrowd/dataset/helpers.py
def download(self, urls: Iterable[str], dest_dir: str):
    """
    Download multiple files to the given directory

    Args:
        urls: list of urls from which files are to be downloaded
        dest_dir: downloaded files will end up here
    """
    self.__active = True

    # https://stackoverflow.com/q/29177490
    #
    # The threads created by ThreadPoolExecutor are daemon
    # Normally, when running as CLI, the parent dies and so do the threads
    #
    # When this is run inside a notebook as a magic command, this doesn't happen
    #   which means that threads are
    #    - still downloading files
    #    - continuously updating the progress bars

    try:
        with ThreadPoolExecutor(max_workers=self.njobs) as pool:
            for url in urls:
                filename = urlparse(url).path.split("/")[-1]
                dest_path = os.path.join(dest_dir, filename)
                pool.submit(self.copy_url, filename, url, dest_path)
    except KeyboardInterrupt:
        # the thread will read this
        self.__active = False
        raise
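
The destination filename is taken from the last segment of the URL path; that step in isolation:

from urllib.parse import urlparse

url = "https://datasets.example.com/files/train.zip"  # placeholder URL
filename = urlparse(url).path.split("/")[-1]
print(filename)  # -> "train.zip"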

get_chunk_size(chunk_size) staticmethod

Returns a fixed chunk size of 32 MB if running on Google Colab

Parameters:

    chunk_size (int): default size to return if not on Colab. Required.
Source code in aicrowd/dataset/helpers.py
@staticmethod
def get_chunk_size(chunk_size: int) -> int:
    """
    Returns a fixed chunk size of 32 MB if running on Google Colab

    Args:
        chunk_size: default size to return if not on colab
    """
    if is_google_colab_env():
        return 32 * 1024 * 1024

    return chunk_size
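
The argument only takes effect outside Colab; on Colab a fixed 32 MB is returned regardless of what was passed:

Downloader.get_chunk_size(1024 * 1024)  # 1048576 locally, 33554432 (32 * 1024 * 1024) on Colab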

get_datasets(challenge_id, api_key)

Queries the AIcrowd API for datasets of this challenge

Parameters:

    challenge_id (int): challenge id. Required.
    api_key (str): AIcrowd API Key. Required.

Returns:

    List[dict]: datasets for a particular challenge.

Source code in aicrowd/dataset/helpers.py
def get_datasets(challenge_id: int, api_key: str) -> List[dict]:
    """
    Queries the AIcrowd API for datasets of this challenge

    Args:
        challenge_id: challenge id
        api_key: AIcrowd API Key

    Returns:
        Datasets for a particular challenge
    """
    log = logging.getLogger()

    r = get_challenge_datasets(api_key, challenge_id)

    if not r.ok:
        log.error("Request to API failed\nReason: %s\nMessage: %s", r.reason, r.text)
        return [{}]

    try:
        return r.json()
    except Exception as e:
        log.error("Parsing response failed\n---\n%s\n---", e)
        return [{}]
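
A minimal usage sketch; the challenge id and API key are placeholders, and the "title" field read from each entry is an assumption about the API response:

datasets = get_datasets(challenge_id=123, api_key="YOUR_API_KEY")  # placeholder values
for idx, dataset in enumerate(datasets):
    print(idx, dataset.get("title"))  # "title" is an assumed field name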

get_file_indices(picked_file, dataset_files)

Returns the indices of dataset files matching the picked file. If the input parses as an integer, it is used directly as an index; otherwise it is treated as a glob pattern matched against file titles.

Parameters:

    picked_file (str): file index or glob pattern for the files to be downloaded. Required.
    dataset_files (List[Dict[str, str]]): dataset file entries, matched against their "title" field. Required.
Source code in aicrowd/dataset/helpers.py
def get_file_indices(
    picked_file: str, dataset_files: List[Dict[str, str]]
) -> List[int]:
    """
    Returns the indices of dataset files matching the picked file.

    If the input parses as an integer, it is used directly as an index;
    otherwise it is treated as a glob pattern matched against file titles.

    Args:
        picked_file: file index or glob pattern for the files to be downloaded
        dataset_files: dataset file entries, matched against their "title" field
    """
    try:
        idx = int(picked_file)
        return [idx]
    except ValueError:
        matched_indices = []
        for idx, dataset_file in enumerate(dataset_files):
            if fnmatch.fnmatch(dataset_file.get("title", ""), picked_file):
                matched_indices.append(idx)
    return matched_indices
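
For example, assuming get_file_indices is importable, with a small list of file entries:

files = [{"title": "train.zip"}, {"title": "test.zip"}, {"title": "README.md"}]
get_file_indices("*.zip", files)  # -> [0, 1] (glob match on titles)
get_file_indices("2", files)      # -> [2] (numeric input is used directly as an index)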

humanize_size(size)

Returns the file size (input in bytes) in a human-readable format

Parameters:

    size (int): size in bytes. Required.

Returns:

    str: size in a human-readable format.

Source code in aicrowd/dataset/helpers.py
def humanize_size(size: int) -> str:
    """
    Returns the file size (input in bytes) in a human-readable format

    Args:
        size: size in bytes

    Returns:
        size in human readable format
    """
    try:
        size = float(size)
    except (TypeError, ValueError):
        # non-numeric input is returned unchanged
        return size

    for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB"]:
        if abs(size) < 1000:
            return f"{size:.2f} {unit}"

        size /= 1000

    return f"{size:.2f} YB"
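
Sample inputs and outputs; note that the function divides by 1000, so the units are decimal despite the KB/MB labels:

humanize_size(999)        # -> "999.00 B"
humanize_size(1_500_000)  # -> "1.50 MB"
humanize_size("n/a")      # -> "n/a" (non-numeric input is returned unchanged)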