Source code for cellmaps_imagedownloader.runner

#! /usr/bin/env python

import os
from multiprocessing import Pool
import re
import csv
import shutil
import logging
import logging.config
import requests
import time
from datetime import date
import warnings
from tqdm import tqdm
from cellmaps_utils import logutils
from cellmaps_utils.provenance import ProvenanceUtil
from cellmaps_utils import constants
import cellmaps_imagedownloader
from cellmaps_imagedownloader.exceptions import CellMapsImageDownloaderError

logger = logging.getLogger(__name__)



[docs]
def download_file_skip_existing(downloadtuple):
    """
    Downloads file in **downloadtuple** unless the file already exists
    with a size greater then 0 bytes, in which case function
    just returns

    :param downloadtuple: (download link, dest file path)
    :type downloadtuple: tuple
    :return: None upon success otherwise:
             (requests status code, text from request, downloadtuple)
    :rtype: tuple
    """
    if os.path.isfile(downloadtuple[1]) and os.path.getsize(downloadtuple[1]) > 0:
        return None
    return download_file(downloadtuple)




[docs]
def download_file(downloadtuple):
    """
    Downloads file pointed to by 'download_url' to
    'destfile'

    .. note::

        Default download function used by :py:class:`~MultiProcessImageDownloader`

    :param downloadtuple: `(download link, dest file path)`
    :type downloadtuple: tuple
    :return: None upon success otherwise:
             `(requests status code, text from request, downloadtuple)`
    :rtype: tuple
    """
    logger.debug('Downloading ' + downloadtuple[0] + ' to ' + downloadtuple[1])
    try:
        with requests.get(downloadtuple[0], stream=True) as r:
            if r.status_code != 200:
                return r.status_code, r.text, downloadtuple
            with open(downloadtuple[1], 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        return None
    except requests.exceptions.HTTPError as e:
        return -1, str(e), downloadtuple
    except requests.exceptions.ConnectionError as e:
        return -2, str(e), downloadtuple
    except requests.exceptions.Timeout as e:
        return -3, str(e), downloadtuple
    except requests.exceptions.RequestException as e:
        return -4, str(e), downloadtuple
    except Exception as e:
        return -5, str(e), downloadtuple




[docs]
class ImageDownloader(object):
    """
    Abstract class that defines interface for classes that download images

    """

    def __init__(self):
        """

        """
        pass


[docs]
    def download_images(self, download_list=None):
        """
        Subclasses should implement

        :param download_list: list of tuples where first element is
                              full URL of image to download and 2nd
                              element is destination path
        :type download_list: list
        :return:
        """
        raise CellMapsImageDownloaderError('Subclasses should implement this')





[docs]
class CM4AICopyDownloader(ImageDownloader):
    """
    Copies over images from CM4AI RO-Crate
    """

    def __init__(self):
        """
        Constructor
        """
        super().__init__()


[docs]
    def download_images(self, download_list=None):
        """
        Subclasses should implement

        :param download_list: list of tuples where first element is
                              full URL of image to download and 2nd
                              element is destination path
        :type download_list: list
        :return:
        """
        num_to_copy = len(download_list)
        logger.info(str(num_to_copy) + ' images to copy')
        t = tqdm(total=num_to_copy, desc='Copy',
                 unit='images')
        for entry in download_list:
            t.update()
            logger.debug('Copying ' + str(entry[0]) + ' to ' + str(entry[1]))
            shutil.copy(entry[0], entry[1])

        return []





[docs]
class FakeImageDownloader(ImageDownloader):
    """
    Creates fake download by downloading
    the first image in each color from
    `Human Protein Atlas <https://www.proteinatlas.org/>`__
    and making renamed copies. The :py:func:`download_file` function
    is used to download the first image of each color

    """

    def __init__(self):
        """
        Constructor

        """
        super().__init__()
        warnings.warn('This downloader generates FAKE images\n'
                      'You have been warned!!!\n'
                      'Have a nice day')


[docs]
    def download_images(self, download_list=None):
        """
        Downloads 1st image from server and then
        and makes renamed copies for subsequent images

        :param download_list:
        :type download_list: list of tuple
        :return:
        """
        num_to_download = len(download_list)
        logger.info(str(num_to_download) + ' images to download')
        t = tqdm(total=num_to_download, desc='Download',
                 unit='images')

        src_image_dict = {}
        # assume 1st four images are the colors for the first image
        for entry in download_list[0:4]:
            t.update()
            if download_file(entry) is not None:
                raise CellMapsImageDownloaderError('Unable to download ' +
                                                   str(entry))
            fname = os.path.basename(entry[1])
            color = re.sub(r'\..*$', '', re.sub(r'^.*_', '', fname))
            src_image_dict[color] = entry[1]

        for entry in download_list[5:]:
            t.update()
            fname = os.path.basename(entry[1])
            color = re.sub(r'\..*$', '', re.sub(r'^.*_', '', fname))
            shutil.copy(src_image_dict[color], entry[1])
        return []





[docs]
class MultiProcessImageDownloader(ImageDownloader):
    """
    Uses multiprocess package to download images in parallel

    """
    POOL_SIZE = 4

    def __init__(self, poolsize=POOL_SIZE, skip_existing=False,
                 override_dfunc=None):
        """
        Constructor

        .. warning::

            Exceeding **poolsize** of ``4`` causes errors from Human Protein Atlas site

        :param poolsize: Number of concurrent downloaders to use.
        :type poolsize: int
        :param skip_existing: If ``True`` skip download if image file exists and has size
                              greater then ``0``
        :type skip_existing: bool
        :param override_dfunc: Function that takes a tuple `(image URL, download str path)`
                               and downloads the image. If ``None`` :py:func:`download_file`
                               function is used
        :type override_dfunc: :py:class:`function`
        """
        super().__init__()
        self._poolsize = poolsize
        if override_dfunc is not None:
            self._dfunc = override_dfunc
        else:
            self._dfunc = download_file
            if skip_existing is True:
                self._dfunc = download_file_skip_existing


[docs]
    def download_images(self, download_list=None):
        """
        Downloads images returning a list of failed downloads

        .. code-block::

            from cellmaps_imagedownloader.runner import MultiProcessImageDownloader

            dloader = MultiProcessImageDownloader(poolsize=2)

            d_list = [('https://images.proteinatlas.org/992/1_A1_1_red.jpg',
                       '/tmp/1_A1_1_red.jpg')]
            failed = dloader.download_images(download_list=d_list)

        :param download_list: Each tuple of format `(image URL, dest file path)`
        :type download_list: list of tuple
        :return: Failed downloads, format of tuple
                 (`http status code`, `text of error`, (`link`, `destfile`))
        :rtype: list of tuple
        """
        failed_downloads = []
        logger.debug('Poolsize for image downloader set to: ' +
                     str(self._poolsize))
        num_to_download = len(download_list)
        logger.info(str(num_to_download) + ' images to download')
        t = tqdm(total=num_to_download, desc='Download',
                 unit='images')
        if self._poolsize <= 1:
            for entry in download_list:
                t.update()
                res = self._dfunc(entry)
                if res is not None:
                    if logger.isEnabledFor(logging.DEBUG):
                        logger.debug('Failed download: ' + str(res))
                    failed_downloads.append(res)
        else:
            with Pool(processes=self._poolsize) as pool:
                for i in pool.imap_unordered(self._dfunc,
                                             download_list):
                    t.update()
                    if i is not None:
                        if logger.isEnabledFor(logging.DEBUG):
                            logger.debug('Failed download: ' + str(i))
                        failed_downloads.append(i)
        t.close()
        return failed_downloads





[docs]
class CellmapsImageDownloader(object):
    """
    Downloads Immunofluorescent images from
    `Human Protein Atlas <https://www.proteinatlas.org>`__
    storing them in an output directory that is locally
    registered as an `RO-Crate <https://www.researchobject.org/ro-crate>`__

    """

    SAMPLES_FILEKEY = 'samples'
    UNIQUE_FILEKEY = 'unique'
    IMG_SUFFIX = '.jpg'

    def __init__(self, outdir=None,
                 imgsuffix=IMG_SUFFIX,
                 imagedownloader=MultiProcessImageDownloader(),
                 imagegen=None,
                 imageurlgen=None,
                 skip_logging=True,
                 provenance=None,
                 input_data_dict=None,
                 provenance_utils=ProvenanceUtil(),
                 skip_failed=False,
                 existing_outdir=False):
        """
        Constructor

        :param outdir: directory where images will be downloaded to
        :type outdir: str
        :param imgsuffix: suffix to append to image file names
        :type imgsuffix: str
        :param imagedownloader: object that will perform image downloads
        :type imagedownloader: :py:class:`~cellmaps_downloader.runner.ImageDownloader`
        :param imagegen: gene node attribute generator for IF image data
        :type imagegen: :py:class:`~cellmaps_imagedownloader.gene.ImageGeneNodeAttributeGenerator`
        :param imageurlgen: URL generator to get URL to download images for given samples
        :type imageurlgen: :py:class:`~CM4AIImageCopyTupleGenerator`,
                           :py:class:`~ImageDownloadTupleGenerator` or
                           :py:class:`~LinkPrefixImageDownloadTupleGenerator`
        :param skip_logging: If ``True`` skip logging, if ``None`` or ``False`` do NOT skip logging
        :type skip_logging: bool
        :param provenance: Provenance information about input files as dictionary. It should include fields like:
                           name, organisation-name, project-name, and other data related info e.g. cell-line,
                           treatment, release, gene-set.

                           Example:

                           .. code-block:: python

                                {
                                    'name': 'Example input dataset',
                                    'organization-name': 'CM4AI',
                                    'project-name': 'Example',
                                    'samples': {
                                        'name': 'u2os HPA IF images',
                                        'author': 'Author of dataset',
                                        'version': 'Version of dataset',
                                        'date-published': 'Date dataset was published',
                                        'description': 'Description of dataset',
                                        'data-format': 'csv'
                                    }
                                }
        :type provenance: dict
        :param input_data_dict: All attributes and their corresponding values of the input data e.g.

                                .. code-block:: python

                                    {'outdir': 'test', 'samples': 'path/to/file/with/samples'}
        :type input_data_dict: dict
        :param provenance_utils: Wrapper for `fairscape-cli <https://pypi.org/project/fairscape-cli>`__
                                 which is used for
                                 `RO-Crate <https://www.researchobject.org/ro-crate>`__ creation and population
        :type provenance_utils: :py:class:`~cellmaps_utils.provenance.ProvenanceUtil`
        :param skip_failed: If ``True`` allow partial downloads, if ``None`` or ``False`` it will throw exception if
                            some image has not been downloaded.
        :type skip_failed: bool
        :param existing_outdir: If ``True`` will not create an output directory but save results in specified existing
                                output directory, if ``None`` or ``False`` new output directory will be created in the
                                path specified in ``outdir`` param.
        :type existing_outdir: bool
        """
        if outdir is None:
            raise CellMapsImageDownloaderError('outdir is None')
        self._outdir = os.path.abspath(outdir)
        self._imagedownloader = imagedownloader
        self._imgsuffix = imgsuffix
        self._start_time = int(time.time())
        self._end_time = -1
        self._imagegen = imagegen
        self._imageurlgen = imageurlgen
        self._provenance = provenance
        self._input_data_dict = input_data_dict
        if skip_logging is None:
            self._skip_logging = False
        else:
            self._skip_logging = skip_logging
        self._samples_datasetid = None
        self._unique_datasetid = None
        self._softwareid = None
        self._image_gene_attrid = None
        self._provenance_utils = provenance_utils
        self._skip_failed = skip_failed
        self._image_dataset_ids = None
        self._existing_outdir = existing_outdir

        if self._input_data_dict is None:
            self._input_data_dict = {'outdir': self._outdir,
                                     'imgsuffix': self._imgsuffix,
                                     'imagedownloader': str(self._imagedownloader),
                                     'imagegen': str(self._imagegen),
                                     'imageurlgen': str(self._imageurlgen),
                                     'skip_logging': self._skip_logging,
                                     'provenance': str(self._provenance)
                                     }


[docs]
    @staticmethod
    def get_example_provenance(requiredonly=True,
                               with_ids=False):
        """
        Gets a dict of provenance parameters needed to add/register
        a dataset with FAIRSCAPE

        :param requiredonly: If ``True`` only output required fields,
                             otherwise output all fields. This value
                             is ignored if **with_ids** is ``True``
        :type requiredonly: bool
        :param with_ids: If ``True`` only output the fields
                         to set dataset guids and ignore value of
                         **requiredonly** parameter.
        :type with_ids: bool
        :return:
        """
        base_dict = {'name': 'Name for pipeline run',
                     'organization-name': 'Name of lab or group. Ex: Ideker',
                     'project-name': 'Name of funding source or project',
                     'cell-line': 'Name of cell line. Ex: U2OS',
                     'treatment': 'Name of treatment, Ex: untreated',
                     'release': 'Name of release. Example: 0.1 alpha',
                     'gene-set': 'Name of gene set. Example chromatin'}
        if with_ids is not None and with_ids is True:
            guid_dict = ProvenanceUtil.example_dataset_provenance(with_ids=with_ids)
            base_dict.update({CellmapsImageDownloader.SAMPLES_FILEKEY: guid_dict,
                              CellmapsImageDownloader.UNIQUE_FILEKEY: guid_dict})
            return base_dict

        field_dict = ProvenanceUtil.example_dataset_provenance(requiredonly=requiredonly)

        base_dict.update({CellmapsImageDownloader.SAMPLES_FILEKEY: field_dict,
                          CellmapsImageDownloader.UNIQUE_FILEKEY: field_dict})
        return base_dict


    def _update_provenance_with_keywords(self):
        """
        Generates appropriate keywords from provenance data set in constructor

        :return: keywords as str values
        :rtype: list
        """
        if self._provenance is None:
            logger.warning('Provenance is None')
            return
        keywords = []
        for key in ['organization-name', 'project-name', 'release',
                    'cell-line', 'treatment', 'gene-set', 'name']:
            if key in self._provenance:
                keywords.append(self._provenance[key])
        keywords.extend(['IF microscopy', 'images'])
        self._provenance['keywords'] = keywords

    def _update_provenance_with_description(self):
        """
        Gets description from provenance
        :return:
        """
        if self._provenance is None:
            logger.warning('Provenance is None')
            return
        desc = ''
        for key in ['organization-name', 'project-name', 'release',
                    'cell-line', 'treatment', 'gene-set', 'name']:
            if key in self._provenance:
                if desc != '':
                    desc += ' '
                desc += self._provenance[key]
        self._provenance['description'] = desc + ' IF microscopy images'

    def _create_output_directory(self):
        """
        Creates output directory if it does not already exist

        :raises CellmapsDownloaderError: If output directory is None or if directory already exists
        """
        if not self._existing_outdir:
            if os.path.isdir(self._outdir):
                raise CellMapsImageDownloaderError(self._outdir + ' already exists')
            else:
                os.makedirs(self._outdir, mode=0o755)

        for cur_color in constants.COLORS:
            cdir = os.path.join(self._outdir, cur_color)
            if not os.path.isdir(cdir):
                logger.debug('Creating directory: ' + cdir)
                os.makedirs(cdir,
                            mode=0o755)

    def _register_software(self):
        """
        Registers this tool, self._provenance must exist and have 'keywords' and
        'description' set

        :raises CellMapsProvenanceError: If fairscape call fails
        """
        software_keywords = self._provenance['keywords']
        software_keywords.extend(['tools', cellmaps_imagedownloader.__name__])
        software_description = self._provenance['description'] + \
                               ' ' + \
                               cellmaps_imagedownloader.__description__
        self._softwareid = self._provenance_utils.register_software(self._outdir,
                                                                    name=cellmaps_imagedownloader.__name__,
                                                                    description=software_description,
                                                                    author=cellmaps_imagedownloader.__author__,
                                                                    version=cellmaps_imagedownloader.__version__,
                                                                    file_format='py',
                                                                    keywords=software_keywords,
                                                                    url=cellmaps_imagedownloader.__repo_url__)

    def _register_image_gene_node_attrs(self, fold=1):
        """
        Registers <fold>_image_gene_node_attributes.tsv file with
        ro-crate as a dataset

        :param fold: name of fold
        :type fold: int
        """
        if self._image_gene_attrid is None:
            self._image_gene_attrid = []

        keywords = self._provenance['keywords']
        keywords.extend(['gene', 'attributes', 'file'])
        description = self._provenance['description'] + ' Fold ' + str(fold) + ' Image gene node attributes file'

        data_dict = {'name': cellmaps_imagedownloader.__name__ +
                             ' output file',
                     'description': description,
                     'data-format': 'tsv',
                     'author': cellmaps_imagedownloader.__name__,
                     'version': cellmaps_imagedownloader.__version__,
                     'schema': 'https://raw.githubusercontent.com/fairscape/cm4ai-schemas/main/v0.1.0/cm4ai_schema_imageloader_gene_node_attributes.json',
                     'keywords': keywords,
                     'date-published': date.today().strftime(self._provenance_utils.get_default_date_format_str())}
        src_file = self.get_image_gene_node_attributes_file(fold)
        self._image_gene_attrid.append(self._provenance_utils.register_dataset(self._outdir,
                                                                               source_file=src_file,
                                                                               data_dict=data_dict))

    def _add_dataset_to_crate(self, data_dict=None,
                              source_file=None, skip_copy=True):
        """
        Adds a dataset to rocrate

        :param crate_path: Path to rocrate
        :type crate_path: str
        :param data_dict: information needed to add dataset
        :type data_dict: dict
        :return:
        """
        return self._provenance_utils.register_dataset(self._outdir,
                                                       source_file=source_file,
                                                       data_dict=data_dict,
                                                       skip_copy=skip_copy)

    def _register_computation(self):
        """
        Registers computation with rocrate.
        Current implementation only registers the 1st 2000 images

        """
        if self._image_gene_attrid is not None:
            generated = self._image_gene_attrid
        else:
            generated = []
        if self._image_dataset_ids is not None:
            generated.extend(self._image_dataset_ids)

        keywords = self._provenance['keywords']
        keywords.extend(['computation', 'download'])
        description = self._provenance['description'] + ' run of ' + cellmaps_imagedownloader.__name__

        self._provenance_utils.register_computation(self._outdir,
                                                    name=cellmaps_imagedownloader.__computation_name__,
                                                    run_by=str(self._provenance_utils.get_login()),
                                                    command=str(self._input_data_dict),
                                                    description=description,
                                                    keywords=keywords,
                                                    used_software=[self._softwareid],
                                                    used_dataset=[self._unique_datasetid, self._samples_datasetid],
                                                    generated=generated)

    def _create_run_crate(self):
        """
        Creates rocrate for output directory

        :raises CellMapsProvenanceError: If there is an error
        """
        try:
            self._provenance_utils.register_rocrate(self._outdir,
                                                    name=self._provenance['name'],
                                                    organization_name=self._provenance['organization-name'],
                                                    project_name=self._provenance['project-name'],
                                                    description=self._provenance['description'],
                                                    keywords=self._provenance['keywords'])
        except TypeError as te:
            raise CellMapsImageDownloaderError('Invalid provenance: ' + str(te))
        except KeyError as ke:
            raise CellMapsImageDownloaderError('Key missing in provenance: ' + str(ke))

    def _register_samples_dataset(self):
        """

        :return:
        """
        if 'guid' in self._provenance[CellmapsImageDownloader.SAMPLES_FILEKEY]:
            self._samples_datasetid = self._provenance[CellmapsImageDownloader.SAMPLES_FILEKEY]['guid']
            return

        # if input file for samples was not set then write the samples we
        # have to the output directory and use that path as dataset to register
        if self._input_data_dict is None or \
            CellmapsImageDownloader.SAMPLES_FILEKEY not in self._input_data_dict or \
            self._input_data_dict[CellmapsImageDownloader.SAMPLES_FILEKEY] is None:
            logger.debug('no samples passed in, just write out copy to output directory')
            samples_file = os.path.join(self._outdir, 'samplescopy.csv')
            self._imagegen.write_samples_to_csvfile(csvfile=samples_file)
            skip_samples_copy = True
        else:
            samples_file = self._input_data_dict[CellmapsImageDownloader.SAMPLES_FILEKEY]
            skip_samples_copy = False
            if os.path.exists(os.path.join(self._outdir, 'samples.csv')):
                skip_samples_copy = True

        # add samples dataset
        self._samples_datasetid = self._add_dataset_to_crate(
            data_dict=self._provenance[CellmapsImageDownloader.SAMPLES_FILEKEY],
            source_file=os.path.abspath(samples_file),
            skip_copy=skip_samples_copy)
        logger.debug('Samples dataset id: ' + str(self._samples_datasetid))

    def _register_unique_dataset(self):
        """

        :return:
        """
        if self._imagegen is None or self._imagegen.get_unique_list() is None:
            return

        if 'guid' in self._provenance[CellmapsImageDownloader.UNIQUE_FILEKEY]:
            self._unique_datasetid = self._provenance[CellmapsImageDownloader.UNIQUE_FILEKEY]['guid']
            return

        # if input file for unique list was not set then write the unique list we
        # have to the output directory and use that path as dataset to register
        if self._input_data_dict is None or \
            CellmapsImageDownloader.UNIQUE_FILEKEY not in self._input_data_dict or \
            self._input_data_dict[CellmapsImageDownloader.UNIQUE_FILEKEY] is None:
            unique_file = os.path.join(self._outdir, 'uniquecopy.csv')
            self._imagegen.write_unique_list_to_csvfile(csvfile=unique_file)
            skip_unique_copy = True
        else:
            unique_file = self._input_data_dict[CellmapsImageDownloader.UNIQUE_FILEKEY]
            skip_unique_copy = False

        # add unique dataset
        self._unique_datasetid = self._add_dataset_to_crate(
            data_dict=self._provenance[CellmapsImageDownloader.UNIQUE_FILEKEY],
            source_file=os.path.abspath(unique_file),
            skip_copy=skip_unique_copy)
        logger.debug('Unique dataset id: ' + str(self._unique_datasetid))

    def _register_downloaded_images(self):
        """
        Registers all the downloaded images
        :return:
        """
        data_dict = {'name': cellmaps_imagedownloader.__name__ + ' downloaded image file',
                     'description': self._provenance['description'] + ' IF image file',
                     'data-format': self._imgsuffix,
                     'author': '???',
                     'version': '???',
                     'date-published': date.today().strftime(self._provenance_utils.get_default_date_format_str())}

        self._image_dataset_ids = []

        for c in constants.COLORS:
            cntr = 0
            for entry in tqdm(os.listdir(os.path.join(self._outdir, c)),
                              desc='FAIRSCAPE ' + c + ' images registration'):
                if not entry.endswith(self._imgsuffix):
                    continue
                fullpath = os.path.join(self._outdir, c, entry)
                data_dict['name'] = entry + ' ' + c + \
                                    ' channel image'
                if len(data_dict['name']) >= 64:
                    data_dict['name'] = data_dict['name'][:63]

                data_dict['keywords'] = [c, 'IF', 'image']
                self._image_dataset_ids.append(self._add_dataset_to_crate(data_dict=data_dict,
                                                                          source_file=fullpath,
                                                                          skip_copy=True))
                cntr += 1
                if cntr > 25:
                    # Todo: https://github.com/fairscape/fairscape-cli/issues/9
                    logger.error('FAIRSCAPE cannot handle too many images, skipping rest')
                    break

    def _get_color_download_map(self):
        """
        Creates a dict where key is color name and value is directory
        path for files for that color

        ``{'red': '/tmp/foo/red'}``

        :return: map of colors to directory paths
        :rtype: dict
        """
        color_d_map = {}
        for c in constants.COLORS:
            color_d_map[c] = os.path.join(self._outdir, c)
        return color_d_map

    def _get_download_tuples(self):
        """
        Gets download list from **imageurlgen** object set via constructor

        :return: list of (image download URL prefix,
                          file path where image should be written)
        :rtype: list
        """
        dtuples = []
        color_map = self._get_color_download_map()
        for image_url, image_dest in self._imageurlgen.get_next_image_url(color_map):
            dtuples.append((image_url, image_dest))
        return dtuples

    def _write_task_start_json(self):
        """
        Writes task_start.json file with information about
        what is to be run

        """
        data = {'image_downloader': str(self._imagedownloader),
                'image_suffix': self._imgsuffix}

        if self._input_data_dict is not None:
            data.update({'commandlineargs': self._input_data_dict})

        logutils.write_task_start_json(outdir=self._outdir,
                                       start_time=self._start_time,
                                       version=cellmaps_imagedownloader.__version__,
                                       data=data)

    def _retry_failed_images(self, failed_downloads=None):
        """

        :param failed_downloads:
        :return:
        """
        downloads_to_retry = []
        error_code_map = {}
        for entry in failed_downloads:
            if entry[0] not in error_code_map:
                error_code_map[entry[0]] = 0
            error_code_map[entry[0]] += 1
            downloads_to_retry.append(entry[2])
        logger.debug('Failed download counts by http error code: ' + str(error_code_map))
        return self._imagedownloader.download_images(downloads_to_retry)

    def _download_images(self, max_retry=5):
        """
        Uses downloader specified in constructor to download images noted in
        tsvfile file also specified in constructor

        :raises CellMapsImageDownloaderError: if image downloader is ``None`` or
                                         if there are failed downloads
        :return: (int with value of 0 upon success otherwise failure, list of failed downloads)
        :rtype: tuple
        """
        if self._imagedownloader is None:
            raise CellMapsImageDownloaderError('Image downloader is None')

        downloadtuples = self._get_download_tuples()

        failed_downloads = self._imagedownloader.download_images(downloadtuples)
        retry_count = 0
        while len(failed_downloads) > 0 and retry_count < max_retry:
            retry_count += 1
            logger.error(str(len(failed_downloads)) +
                         ' images failed to download. Retrying #' + str(retry_count))

            # try one more time with files that failed
            failed_downloads = self._retry_failed_images(failed_downloads=failed_downloads)

        if len(failed_downloads) > 0 and (self._skip_failed is None or self._skip_failed is False):
            raise CellMapsImageDownloaderError('Failed to download: ' +
                                               str(len(failed_downloads)) + ' images')
        return 0, failed_downloads


[docs]
    def get_image_gene_node_attributes_file(self, fold):
        """
        Gets full path to image gene node attribute file under output directory
        created when invoking :py:meth:`~cellmaps_imagedownloader.runner.CellmapsImageDownloader.run`

        :return: Path to file
        :rtype: str
        """
        return os.path.join(self._outdir, str(fold) + '_' +
                            constants.IMAGE_GENE_NODE_ATTR_FILE)



[docs]
    def get_image_gene_node_errors_file(self):
        """
        Gets full path to image gene node attribute errors file under output directory
        created when invoking :py:meth:`~cellmaps_imagedownloader.runner.CellmapsImageDownloader.run`

        :return: Path to file
        :rtype: str
        """
        return os.path.join(self._outdir,
                            constants.IMAGE_GENE_NODE_ERRORS_FILE)


    def _write_image_gene_node_attrs(self, gene_node_attrs=None, fold=1,
                                     errors=None):
        """

        :param gene_node_attrs:
        :param errors:
        :return:
        """
        with open(self.get_image_gene_node_attributes_file(fold), 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=constants.IMAGE_GENE_NODE_COLS, delimiter='\t')
            writer.writeheader()
            for key in gene_node_attrs:
                writer.writerow(gene_node_attrs[key])
        if errors is not None:
            with open(self.get_image_gene_node_errors_file(), 'w') as f:
                for e in errors:
                    f.write(str(e) + '\n')

    def _add_imageurl_to_gene_node_attrs(self, gene_node_attrs=None):
        """
        Adds imageurl to **gene_node_attrs** passed in

        :param gene_node_attrs:
        :type gene_node_attrs: dict
        """
        sample_urlmap = self._imageurlgen.get_sample_urlmap()
        for key in gene_node_attrs:
            sample = gene_node_attrs[key]
            image_id = re.sub('^HPA0*|^CAB0*', '', sample['antibody']) + '/' + \
                       sample['filename']
            if image_id in sample_urlmap:
                if sample_urlmap[image_id].startswith('http'):
                    gene_node_attrs[key][constants.IMAGE_GENE_NODE_IMAGEURL_COL] = sample_urlmap[image_id]
                else:
                    gene_node_attrs[key][constants.IMAGE_GENE_NODE_IMAGEURL_COL] = 'no image url found'
            else:
                # this should NOT happen, but just in case
                logger.error(image_id + ' not in sample urlmap. setting to no image url found')
                gene_node_attrs[key][constants.IMAGE_GENE_NODE_IMAGEURL_COL] = 'no image url found'


[docs]
    def generate_readme(self):
        description = getattr(cellmaps_imagedownloader, '__description__', 'No description provided.')
        version = getattr(cellmaps_imagedownloader, '__version__', '0.0.0')

        with open(os.path.join(os.path.dirname(__file__), 'readme_outputs.txt'), 'r') as f:
            readme_outputs = f.read()

        readme = readme_outputs.format(DESCRIPTION=description, VERSION=version)
        with open(os.path.join(self._outdir, 'README.txt'), 'w') as f:
            f.write(readme)



[docs]
    def run(self):
        """
        Downloads images to output directory specified in constructor
        using tsvfile for list of images to download

        :raises CellMapsImageDownloaderError: If there is an error
        :return: 0 upon success, otherwise failure
        """
        exitcode = 99
        try:
            self._create_output_directory()
            if self._skip_logging is False:
                logutils.setup_filelogger(outdir=self._outdir,
                                          handlerprefix='cellmaps_imagedownloader')
            self._write_task_start_json()

            self.generate_readme()

            self._update_provenance_with_description()
            self._update_provenance_with_keywords()
            self._create_run_crate()
            self._register_samples_dataset()
            self._register_unique_dataset()

            self._register_software()

            exitcode, failed_downloads = self._download_images()
            # todo need to validate downloaded image data

            # Remove entries from samples that lack a download URL
            self._imagegen.filter_samples_by_sample_urlmap(self._imageurlgen.get_sample_urlmap())

            # write image attribute data
            for fold in [1, 2]:
                image_gene_node_attrs, errors = self._imagegen.get_gene_node_attributes(fold)
                self._add_imageurl_to_gene_node_attrs(gene_node_attrs=image_gene_node_attrs)
                # write image attribute data
                self._write_image_gene_node_attrs(image_gene_node_attrs, fold, errors)

                self._register_image_gene_node_attrs(fold)

            self._register_downloaded_images()

            self._register_computation()

            return exitcode
        finally:
            self._end_time = int(time.time())
            # write a task finish file
            logutils.write_task_finish_json(outdir=self._outdir,
                                            start_time=self._start_time,
                                            end_time=self._end_time,
                                            status=exitcode)