Downloaders

downloaders #

HuggingFaceDownloader #

HuggingFaceDownloader(repo_id: str)

Bases: imgnet.download.base.BaseDownloader

Source code in src/imgnet/download/downloaders.py

def __init__(self, repo_id: str) -> None:
    """Store the identifier of the Hugging Face dataset repository to download from.

    Args:
        repo_id: Repository identifier; ``download`` passes it to the
            Hugging Face hub calls together with ``repo_type="dataset"``.
    """
    self.repo_id = repo_id

download #

download(output_path: pathlib.Path, instance_ids: list[str] | None = None, **kwargs: typing.Any) -> None

Download from Hugging Face. Here instance_ids is a list of filenames to download.

Source code in src/imgnet/download/downloaders.py

def download(
    self,
    output_path: Path,
    instance_ids: list[str] | None = None,
    **kwargs: Any,  # noqa: ANN401
) -> None:
    """Download from Hugging Face. Here instance_ids is a list of filenames to download.

    Args:
        output_path: Directory that receives both the files fetched through
            ``snapshot_download`` and the members extracted from remote
            archives. It is created (including parents) when missing.
        instance_ids: File names to fetch. Each name is first matched against
            the repository members; names that are still unmatched are then
            looked up inside every supported archive of the repository.
            ``None`` downloads every member.
        **kwargs: Forwarded unchanged to ``snapshot_download``.

    Names that cannot be located anywhere are reported with a warning; no
    exception is raised for them.
    """
    # Archive members are extracted into output_path before snapshot_download
    # gets a chance to create it, so make sure the directory exists up front
    # (the Zenodo, Dropbox and IDC downloaders do the same).
    output_path.mkdir(parents=True, exist_ok=True)

    if instance_ids is None:
        logger.info(f"Downloading all instances from Hugging Face repository {self.repo_id}")
        files_to_download = self.members
    else:
        logger.info(f"Downloading {len(instance_ids)} instances from Hugging Face repository {self.repo_id}")
        remaining = set(instance_ids)
        files_to_download = []

        for file_name in self.members:
            # Direct hit: the requested name is a top-level repository file.
            if file_name in remaining:
                files_to_download.append(file_name)
                remaining.remove(file_name)
                continue

            # Otherwise try to pull the still-missing names out of the archive
            # without downloading the archive as a whole.
            if RemoteArchive.is_supported_archive(file_name) and remaining:
                archive_url = hf_hub_url(
                    repo_id=self.repo_id,
                    filename=file_name,
                    repo_type="dataset",
                )
                archive = RemoteArchive(
                    archive_url, Path(file_name).suffix
                )
                extracted = archive.extract(
                    filenames=sorted(remaining),
                    output_path=output_path,
                )
                remaining -= set(extracted)

        if remaining:
            msg = f"Instance IDs {sorted(remaining)} not found in Hugging Face repository {self.repo_id}"
            logger.warning(msg)

    if len(files_to_download) > 0:
        # Silence the hub's own progress bars while the logging-aware tqdm is
        # active, and always restore them afterwards, even on failure.
        disable_progress_bars()
        try:
            with tqdm_logging_redirect():
                snapshot_download(
                    repo_id=self.repo_id,
                    local_dir=output_path,
                    tqdm_class=_tqdm,
                    allow_patterns=files_to_download,
                    repo_type="dataset",
                    **kwargs,
                )
        finally:
            enable_progress_bars()

ZenodoDownloader #

ZenodoDownloader(record_id: str)

Bases: imgnet.download.base.BaseDownloader

Source code in src/imgnet/download/downloaders.py

def __init__(self, record_id: str) -> None:
    """Store the identifier of the Zenodo record to download from.

    Args:
        record_id: Zenodo record identifier; ``download`` includes it in its
            log messages.
    """
    self.record_id = record_id

download #

download(output_path: pathlib.Path, instance_ids: list[str] | None = None, **kwargs: typing.Any) -> None

Download files from Zenodo. Here instance_ids is a list of filenames to download.

Source code in src/imgnet/download/downloaders.py

def download(
    self,
    output_path: Path,
    instance_ids: list[str] | None = None,
    **kwargs: Any,  # noqa: ANN401
) -> None:
    """Fetch files of the Zenodo record into ``output_path``.

    Args:
        output_path: Target directory; created (with parents) when missing.
        instance_ids: File names to fetch. A name matching a record file
            selects that file for a plain HTTP download; names still
            unmatched are requested from each supported archive in the
            record. ``None`` selects every file of the record.
        **kwargs: Accepted for interface compatibility; not used here.

    Names that are found nowhere are reported through a warning.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    record_files = self.files_info

    if instance_ids is not None:
        logger.info(f"Downloading {len(instance_ids)} files from Zenodo record {self.record_id}")
        wanted = set(instance_ids)
        selected = []

        for entry in record_files:
            name = entry["key"]
            if name in wanted:
                # Requested name is a file of the record itself.
                selected.append(entry)
                wanted.remove(name)
            elif RemoteArchive.is_supported_archive(name) and wanted:
                # Ask the archive for whatever is still outstanding.
                remote = RemoteArchive(
                    entry["links"]["self"], Path(name).suffix
                )
                found = remote.extract(
                    filenames=sorted(wanted),
                    output_path=output_path,
                )
                wanted -= set(found)

        if wanted:
            logger.warning(
                f"Instance IDs {sorted(wanted)} not found in Zenodo record {self.record_id}"
            )
    else:
        selected = record_files
        logger.info(f"Downloading all files from Zenodo record {self.record_id}")

    # Plain downloads happen last, after any archive extraction above.
    for entry in selected:
        _download_http_file(
            url=entry["links"]["self"],
            out_file=output_path / entry["key"],
            desc=entry["key"],
            size=entry["size"],
        )

DropboxDownloader #

DropboxDownloader(url: str)

Bases: imgnet.download.base.BaseDownloader

Source code in src/imgnet/download/downloaders.py

def __init__(self, url: str) -> None:
    """Rewrite a Dropbox share link and keep the result as ``self.url``.

    Two plain substring substitutions are applied, in this order:
    ``dl=0`` becomes ``dl=1`` and ``www.dropbox.com`` becomes
    ``dl.dropboxusercontent.com``. A URL containing neither substring is
    stored unchanged.

    Args:
        url: Dropbox link as shared by the user.
    """
    with_download_flag = url.replace("dl=0", "dl=1")
    direct_host_url = with_download_flag.replace(
        "www.dropbox.com", "dl.dropboxusercontent.com"
    )
    self.url = direct_host_url

download #

download(output_path: pathlib.Path, instance_ids: list[str] | None = None, **kwargs: typing.Any) -> None

Download from Dropbox. Supports selecting specific instance_ids from archives.

Source code in src/imgnet/download/downloaders.py

def download(
    self,
    output_path: Path,
    instance_ids: list[str] | None = None,
    **kwargs: Any,  # noqa: ANN401
) -> None:
    """Download from Dropbox. Supports selecting specific instance_ids from archives.

    Args:
        output_path: Target directory; created (with parents) when missing.
        instance_ids: Names to fetch. The name of the shared file itself
            selects a plain download of that file; any other names are
            requested from the shared file when it is a supported archive.
            ``None`` downloads the shared file as is.
        **kwargs: Accepted for interface compatibility; not used here.

    Names that are found nowhere are reported through a warning.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    output_path.mkdir(parents=True, exist_ok=True)
    # URL paths are percent-encoded ("my%20scan.zip"). Decode before taking
    # the file name so the saved file and the comparison against
    # instance_ids both use the real name.
    file_name = Path(unquote(urlparse(self.url).path)).name

    if instance_ids is None:
        logger.info(f"Downloading all files from Dropbox source {self.url}")
        _download_http_file(
            url=self.url,
            out_file=output_path / file_name,
            desc=file_name,
        )
        return

    remaining = set(instance_ids)
    logger.info(f"Downloading {len(remaining)} files from Dropbox source {self.url}")
    if file_name in remaining:
        _download_http_file(
            url=self.url,
            out_file=output_path / file_name,
            desc=file_name,
        )
        remaining.remove(file_name)

    # Anything still outstanding can only live inside the shared archive.
    if RemoteArchive.is_supported_archive(file_name) and remaining:
        archive = RemoteArchive(self.url, Path(file_name).suffix)
        extracted = archive.extract(
            filenames=sorted(remaining), output_path=output_path
        )
        remaining -= set(extracted)

    if remaining:
        # Name the source in the warning, like the info messages above and
        # the warnings of the other downloaders do.
        msg = (
            f"Instance IDs {sorted(remaining)} not found in Dropbox source {self.url}"
        )
        logger.warning(msg)

IDCDownloader #

IDCDownloader(collection_id: str)

Bases: imgnet.download.base.BaseDownloader

Source code in src/imgnet/download/downloaders.py

def __init__(self, collection_id: str) -> None:
    """Normalise the collection identifier and obtain an IDC client.

    The identifier is lower-cased and every space or hyphen is replaced by
    an underscore before it is stored as ``self.collection_id``. The client
    returned by ``get_idc_client()`` is kept as ``self.client``.

    Args:
        collection_id: Collection name as supplied by the caller.
    """
    normalised = collection_id.lower()
    for separator in (" ", "-"):
        normalised = normalised.replace(separator, "_")
    self.collection_id = normalised
    self.client = get_idc_client()

download #

download(output_path: pathlib.Path, instance_ids: list[str] | None = None, **kwargs: typing.Any) -> None

Download from IDC. Here instance_ids is a list of series UIDs to download.

Source code in src/imgnet/download/downloaders.py

def download(
    self,
    output_path: Path,
    instance_ids: list[str] | None = None,
    **kwargs: Any,  # noqa: ANN401
) -> None:
    """Download from IDC. Here instance_ids is a list of series UIDs to download."""

    if instance_ids is not None:
        if not all(
            instance_id in self.members for instance_id in instance_ids
        ):
            msg = f"Instance IDs {instance_ids} not found in IDC collection {self.collection_id}"
            raise ValueError(msg)
        series_uids = instance_ids
    else:
        logger.warning(f"No instance IDs provided, downloading all series from IDC collection {self.collection_id}")
        series_uids = self.members

    output_path.mkdir(parents=True, exist_ok=True)
    with tqdm_logging_redirect():
        self.client.download_dicom_series(
            series_uids,
            output_path,
            dirTemplate="%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID",
        )