helpers
Dataset-related tasks
Downloader
A rudimentary URL downloader (like wget or curl) to demonstrate Rich progress bars.
Adapted from here
__init__(self, njobs=None, api_key=None, chunk_size=1048576)
special
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| njobs | int | number of parallel downloads | None |
| api_key | str | AIcrowd API Key | None |
| chunk_size | int | no. of bytes to process in each iteration | 1048576 |
Source code in aicrowd/dataset/helpers.py
def __init__(
self, njobs: int = None, api_key: str = None, chunk_size: int = 1024 * 1024
):
"""
Args:
njobs: number of parallel downloads
api_key: AIcrowd API Key
chunk_size: no. of bytes to process in each iteration
"""
self.progress_bar = TqdmProgressBar()
if njobs is None:
njobs = max(os.cpu_count() // 2, 1)
self.njobs = njobs
self.api_key = api_key
# need this to indicate when the thread should die
self.__active = False
self.chunk_size = self.get_chunk_size(chunk_size)
self.log = logging.getLogger()
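A minimal construction sketch (the API key is a placeholder; the import path follows the source location above):

```python
from aicrowd.dataset.helpers import Downloader

# njobs defaults to max(os.cpu_count() // 2, 1) when omitted
downloader = Downloader(njobs=4, api_key="<your-aicrowd-api-key>")
```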
copy_url(self, filename, url, path)
Copy data from a url to a local file
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| filename | str | name displayed alongside the progress bar | required |
| url | str | url for the file to be downloaded | required |
| path | str | path where the file will be saved | required |
Source code in aicrowd/dataset/helpers.py
def copy_url(self, filename: str, url: str, path: str) -> None:
    """
    Copy data from a url to a local file
    Args:
        filename: name displayed alongside the progress bar
        url: url for the file to be downloaded
        path: path where the file will be saved
    """
req = Request(url)
if urlparse(url).netloc == DATASETS_HOST:
req.add_unredirected_header("Authorization", f"Token {self.api_key}")
try:
response = urlopen(req)
except HTTPError as e:
self.log.error("Error in downloading dataset %s.\n%s", url, e)
click.echo(click.style(f"Error in downloading dataset {url}", fg="red"))
return
# This will break if the response doesn't contain content length
task_id = self.progress_bar.add(
filename=filename,
total=int(response.info().get("Content-length")),
)
with open(path, "wb") as dest_file:
for data in iter(partial(response.read, self.chunk_size), b""):
# parent has quit, die
if not self.__active:
return
dest_file.write(data)
self.progress_bar.update(progress_bar_id=task_id, step=len(data))
    self.log.info("File %s downloaded successfully", path)
self.progress_bar.close(progress_bar_id=task_id)
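The `iter(partial(response.read, self.chunk_size), b"")` construct is the two-argument form of `iter`: it calls `response.read(chunk_size)` repeatedly and stops once the sentinel `b""` (end of stream) is returned. A self-contained illustration using an in-memory stream:

```python
from functools import partial
from io import BytesIO

stream = BytesIO(b"x" * 2500)
# each call reads up to 1024 bytes; iteration stops at the b"" sentinel
chunks = list(iter(partial(stream.read, 1024), b""))
assert [len(c) for c in chunks] == [1024, 1024, 452]
```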
download(self, urls, dest_dir)
Download multiple files to the given directory
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| urls | Iterable[str] | list of urls from which files are to be downloaded | required |
| dest_dir | str | downloaded files will end up here | required |
Source code in aicrowd/dataset/helpers.py
def download(self, urls: Iterable[str], dest_dir: str):
"""
    Download multiple files to the given directory
Args:
urls: list of urls from which files are to be downloaded
dest_dir: downloaded files will end up here
"""
self.__active = True
# https://stackoverflow.com/q/29177490
#
# The threads created by ThreadPoolExecutor are daemon
# Normally, when running as CLI, the parent dies and so do the threads
#
# When this is run inside a notebook as a magic command, this doesn't happen
# which means that threads are
# - still downloading files
# - continuously updating the progress bars
try:
with ThreadPoolExecutor(max_workers=self.njobs) as pool:
for url in urls:
filename = urlparse(url).path.split("/")[-1]
dest_path = os.path.join(dest_dir, filename)
pool.submit(self.copy_url, filename, url, dest_path)
except KeyboardInterrupt:
# the thread will read this
self.__active = False
raise
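A hypothetical call (the URLs are placeholders, and `dest_dir` should already exist, since `copy_url` opens `os.path.join(dest_dir, filename)` directly):

```python
from aicrowd.dataset.helpers import Downloader

downloader = Downloader(njobs=2, api_key="<your-aicrowd-api-key>")
downloader.download(
    urls=[
        "https://example.com/data/train.csv",
        "https://example.com/data/test.csv",
    ],
    dest_dir="data",
)
```

The call blocks until all files finish, because the `ThreadPoolExecutor` context manager joins its workers on exit.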
get_chunk_size(chunk_size)
staticmethod
Return a chunk size of 32 MB when running on Google Colab
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| chunk_size | int | default size to return if not on Colab | required |
Source code in aicrowd/dataset/helpers.py
@staticmethod
def get_chunk_size(chunk_size: int) -> int:
"""
Set chunk size of 100 MB if running on Google colab
Args:
chunk_size: default size to return if not on colab
"""
if is_google_colab_env():
return 32 * 1024 * 1024
return chunk_size
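Being a `staticmethod`, it can be called without an instance; outside Colab the argument passes through unchanged:

```python
from aicrowd.dataset.helpers import Downloader

assert Downloader.get_chunk_size(1024 * 1024) == 1024 * 1024  # on a non-Colab machine
```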
get_datasets(challenge_id, api_key)
Queries the AIcrowd API for the datasets of this challenge
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| challenge_id | int | challenge id | required |
| api_key | str | AIcrowd API Key | required |

Returns:

| Type | Description |
|---|---|
| List[dict] | Datasets for a particular challenge |
Source code in aicrowd/dataset/helpers.py
def get_datasets(challenge_id: int, api_key: str) -> List[dict]:
"""
    Queries the AIcrowd API for the datasets of this challenge
Args:
challenge_id: challenge id
api_key: AIcrowd API Key
Returns:
Datasets for a particular challenge
"""
log = logging.getLogger()
r = get_challenge_datasets(api_key, challenge_id)
if not r.ok:
log.error("Request to API failed\nReason: %s\nMessage: %s", r.reason, r.text)
return [{}]
try:
return r.json()
except Exception as e:
log.error("Parsing response failed\n---\n%s\n---", e)
return [{}]
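A usage sketch; the challenge id is a placeholder, and the exact keys in each returned dict depend on the AIcrowd API response (only the "title" key is relied on elsewhere in this module):

```python
from aicrowd.dataset.helpers import get_datasets

datasets = get_datasets(challenge_id=123, api_key="<your-aicrowd-api-key>")
for idx, dataset in enumerate(datasets):
    print(idx, dataset.get("title"))
```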
get_file_indices(picked_file, dataset_files)
Resolves the picked file to dataset indices: an integer input is returned as a single-element list; otherwise the input is treated as a glob pattern and the indices of all matching files are returned.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| picked_file | str | index or glob pattern for files to be downloaded | required |
| dataset_files | List[Dict[str, str]] | dataset file entries, each carrying a "title" key | required |
Source code in aicrowd/dataset/helpers.py
def get_file_indices(
picked_file: str, dataset_files: List[Dict[str, str]]
) -> List[int]:
"""
Returns the index of the picked file in the dataset if the provided input is not an integer.
Args:
picked_file: glob pattern for files to be downloaded
dataset_files:
"""
try:
idx = int(picked_file)
return [idx]
except ValueError:
matched_indices = []
for idx, dataset_file in enumerate(dataset_files):
if fnmatch.fnmatch(dataset_file.get("title", ""), picked_file):
matched_indices.append(idx)
return matched_indices
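Both branches, demonstrated with toy dataset entries (the titles are made up):

```python
from aicrowd.dataset.helpers import get_file_indices

files = [{"title": "train.zip"}, {"title": "test.zip"}, {"title": "README.md"}]

assert get_file_indices("1", files) == [1]         # numeric string: that index
assert get_file_indices("*.zip", files) == [0, 1]  # glob pattern: matching indices
assert get_file_indices("*.csv", files) == []      # no matches: empty list
```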
humanize_size(size)
Returns the file size (in bytes) in a human-readable format
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| size | int | size in bytes | required |

Returns:

| Type | Description |
|---|---|
| str | size in human readable format |
Source code in aicrowd/dataset/helpers.py
def humanize_size(size: int) -> str:
"""
    Returns the file size (in bytes) in a human readable format
Args:
size: size in bytes
Returns:
size in human readable format
"""
    try:
        size = float(size)
    except (TypeError, ValueError):
        # non-numeric input is returned unchanged
        return size
for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB"]:
if abs(size) < 1000:
return f"{size:.2f} {unit}"
size /= 1000
return f"{size:.2f} YB"