# Source code for esp.models.file

# -*- coding: utf-8 -*-
#
# file-related models.
#
# ------------------------------------------------


# imports
# -------
import base64
import datetime
import json
import os

from gems import cached
import six
from six.moves.urllib.parse import urlparse

import esp.base as base
from .__base__ import LinkedModel, BaseModel


# models
# ------
class File(LinkedModel):
    """
    Object for interacting with files from the ESP database.

    See the `Usage <./usage.html>`_ and `Examples <./examples.html>`_ pages
    of the documentation for more context and comprehensive examples of
    how to create and use this type of objects.

    Configuration:

        Create file:

        .. code-block:: yaml

            name: My Protocol SOP
            desc: An SOP file
            tags: [sop, protocol]
            uri: relative/path/to/instructions.pdf
            upload: true
            type: PDF

        Create file object with reference to existing file:

        .. code-block:: yaml

            name: Large Sequencing File
            desc: An SOP file
            tags: [sop, protocol]
            uri: file:///path/to/local/file/instructions.bam
            upload: true
            type: raw

        Create file object and register as task output:

        .. code-block:: yaml

            name: SampleSheet.csv
            desc: Illumina Run Sheet
            task_file: Illumina Runsheet
            uri: /path/to/SampleSheet.csv
            upload: false
            type: csv

    Configuration Notes:

        * Due to current backend limitations, `uri` inputs can only take the
          formats: "relative/path", "/absolute/path", and "file:///absolute/path".

        * If `upload` is true, uri must resolve to an existing file on the
          local file system, otherwise an error will be raised. If upload is
          False, file will only be registered. Upload defaults to false.

        * The `task_file` parameter is primarily useful when registering pipeline output
          files (since you can pass in the task instance UUID).

        * "type" is used internally by ESP and is not necessarily a mime type.
          For instance, the mime type of a bam file is generally application/gzip
          or application/octet-stream, but the type in ESP might normally be "bam".

        * mime-type is an acceptable key for specifying the mime-type. If no
          mime-type is provided, the client will attempt to guess the type
          and pass that type along as the content type of the uploaded file.

    Examples:

        .. code-block:: python

            >>> from esp.models import File
            >>> fi = File('My Protocol SOP')
            >>> fi.name, fi.created_at
            ('My Protocol SOP', '2019-06-21T16:04:01.199076Z')

            >>> # show relationships
            >>> fi.path
            '/path/to/Lab7_ESP/Data/files/0000/instructions.pdf.6a523a74-6703-4474-a24e-5a463b9d9770'
            >>> fi.contents
            '... raw file contents ...'
            >>> fi.download('local-copy.pdf')

    Arguments:
        ident (str): Name or uuid for object.
    """

    __api__ = "files"
    __api_cls__ = "Lab7File"
    __allow_update__ = False

    # NOTE: PUSHING META ON FILE OBJECTS NOT SUPPORTED BY BACKEND
    __mutable__ = [x for x in BaseModel.__mutable__ if x != "meta"]

    __exportable__ = BaseModel.__base_exportable__ + [
        "uri",
        "upload",
        "type",
        "mime-type",
    ]

    # per-key export overrides: exports always request an upload and read
    # the mime type back out of the stored meta.
    __export_format__ = {
        "uri": lambda x: x.url,
        "upload": lambda x: True,
        "mime-type": lambda x: x.meta.get("mime-type"),
    }

    @classmethod
    def parse_import(cls, config, overwrite=False, allow_snapshot_uuid_remap=False):
        """
        Create new object in ESP database using config file or other data.

        Depending on the ``upload`` key, the file is either uploaded
        (multipart POST of the local file) or only registered (JSON POST of
        the file location plus metadata gathered from the local file system).

        Note:
            ``config`` is modified in place (keys are popped from it and
            ``url``, ``meta``, ``mode``, ... are written to it).

        Args:
            config (str, dict, list): Config file or information to use in
                creating new object.
            overwrite (bool): Whether or not to delete current entry in
                the ESP database. Not consulted by this method itself.
            allow_snapshot_uuid_remap (bool): Accepted for signature
                compatibility; not consulted by this method itself.

        Returns:
            File: Object built from the JSON payload returned by the server.

        Raises:
            ValueError: If no uri is supplied, if the uri carries a network
                location, if a task file is requested without any task
                instance uuid being available, or if an upload is requested
                for a non-local or missing file.
        """
        import mimetypes

        # process uri
        uri = config.pop("uri", None)
        if uri is None:
            # fail early with a readable message instead of the TypeError
            # os.path.expandvars raises for None.
            raise ValueError(
                "No uri specified for file. "
                "Use: relative/path, /absolute/path, or file:///absolute/path."
            )
        parsed_uri = urlparse(os.path.expandvars(uri), scheme="file")
        if parsed_uri.scheme == "file" and parsed_uri.netloc:
            raise ValueError(
                "File uri specified, but not a valid network location. "
                "Use: relative/path, /absolute/path, or file:///absolute/path. "
                "You used: {}".format(uri)
            )
        config["url"] = parsed_uri.geturl()
        path = parsed_uri.path
        config.setdefault("meta", {})

        # add taskfile dependencies
        task_file = config.pop("task_file", None)
        if task_file is not None:
            # add taskfile dep to existing list, unless the caller already
            # supplied a [uuid, "task_file"] dependency explicitly.
            deps = config.get("deps", [])
            task_deps = [x for x in deps if len(x) == 2 and x[1] == "task_file"]
            if not task_deps:
                task_uuid = os.environ.get("LAB7_TASK_UUID", None)
                if not task_uuid:
                    raise ValueError(
                        "Cannot register a task file while no task is running "
                        "unless you explicitly supply the task instance uuid."
                    )
                deps.append([task_uuid, "task_file"])

            # update tags and meta
            tags, meta = config.get("tags", []), config.get("meta", {})
            if "task_file" not in tags:
                tags.append("task_file")
            meta.update({"taskfile_name": task_file})
            config.update({"meta": meta, "deps": deps, "tags": tags})

        # upload file
        if config.pop("upload", False):
            if parsed_uri.scheme not in ("", "file"):
                # report the uri as given: the bare path component hides the
                # scheme/host that made the location non-local.
                raise ValueError(
                    "To upload a file, the defined URL needs be a local file location "
                    f'("file://..."). Your location is: {uri}'
                )
            # check for existing file
            if not os.path.exists(path):
                raise ValueError(f"Cannot upload missing file: {path}")

            mimetype = config.pop("mime-type", config["meta"].get("mime-type"))
            if not mimetype:
                # attempt to guess the mime type, with a generic fallback.
                # TODO: Worth introducing a fallback on magic-string-based
                # approaches?
                mimetype = mimetypes.guess_type(path)[0] or "application/octet-stream"

            # read file and do upload
            with open(path, "rb") as fi:
                files = {"file": (os.path.basename(path), fi, mimetype)}
                config["mode"] = "upload"
                # tags/meta travel as form fields here, so serialize them.
                config["tags"] = json.dumps(config.get("tags", []))
                config["meta"] = json.dumps(config.get("meta", {}))

                # the session-level Content-Type is removed for the duration
                # of the upload (presumably so the multipart content type is
                # generated for the request) and is always put back, even
                # when the POST raises.
                ctype = base.SESSION.session.headers.pop("Content-Type", None)
                try:
                    result = base.SESSION.post("/api/files", files=files, data=config)
                finally:
                    if ctype is not None:
                        base.SESSION.session.headers["Content-Type"] = ctype
            return File.from_data(result.json())

        # todo: There is a freakish case where we can have a scheme other than file
        # in the URL and it still points to a valid file, e. g.
        #   http://something.org/etc/passwd
        # We probably don't care about that at this point, but something to keep in mind.

        # make sure modified_time and created_time are present.
        # note that these are the times of the OS _file_, not the ESP
        # File.
        try:
            stats = os.stat(path)
        except FileNotFoundError:
            ctime = datetime.datetime.now()
            # note: no proper cross-platform-compatible way to pick up
            # ctime since on posix-compliant FSs, ctime = mtime. But
            # reasonable default behavior that callers can override by
            # supplying meta.
            mtime = ctime
            size = 0
            mimetype = "application/octet-stream"
        else:
            ctime = datetime.datetime.fromtimestamp(stats.st_ctime)
            mtime = datetime.datetime.fromtimestamp(stats.st_mtime)
            size = stats.st_size
            mimetype = mimetypes.guess_type(path)[0] or "application/octet-stream"

        # an explicitly configured mime type (either spelling) wins over the
        # guessed one; the key is removed from the top-level config.
        if "mime-type" in config:
            mimetype = config.pop("mime-type")
        elif "mime_type" in config:
            mimetype = config.pop("mime_type")

        # only fill in meta values the caller did not supply.
        meta = config["meta"]
        meta.setdefault("modified_time", mtime.isoformat())
        meta.setdefault("created_time", ctime.isoformat())
        meta.setdefault("file_size", size)
        meta.setdefault("mime_type", mimetype)
        meta.setdefault("original_filename", os.path.basename(path))

        # do POST and return object
        config["mode"] = "register"
        result = base.SESSION.post("/api/files", json=config)
        return File.from_data(result.json())

    @cached
    def contents(self):
        """
        Return raw file contents (as string).

        The value is the ``data`` entry of the JSON document returned by the
        ``/api/files/<uuid>/contents`` endpoint (``None`` if absent).
        """
        res = base.SESSION.get("/api/files/{}/contents".format(self.uuid))
        return res.json().get("data")

    def download(self, outfile, encoding="UTF-8"):
        """
        Download file and store as output filename.

        Args:
            outfile (str|os.PathLike|file): Path to downloaded file or file
                object to write to. File objects without a string ``mode``
                attribute (e.g. ``io.BytesIO``/``io.StringIO``) are treated
                as text targets only if they derive from ``io.TextIOBase``.
            encoding (str): If outfile is a file object in string mode, the string encoding
                to use to decode the binary data from the server.

        Note:
            Fetches the contents in a single go. For large text files where only a subset may be
            needed, consider using the 'contents' endpoint instead.
            For large, streamable binary files, such as BAM files, esp REST APIS have support for
            ranged data retrieval that is not yet supported by the python client.
        """
        import io

        contents = base.SESSION.get("/api/files/{}/download".format(self.uuid)).content

        # Bytes with a path: open binary and write.
        if isinstance(outfile, six.string_types + (os.PathLike,)):
            with open(outfile, "wb") as file_:
                file_.write(contents)
            return

        # File object: trust a string ``mode`` when there is one, otherwise
        # fall back on the io class hierarchy (in-memory buffers have no
        # ``mode``, and some file types expose a non-string one).
        mode = getattr(outfile, "mode", None)
        if isinstance(mode, six.string_types):
            binary = "b" in mode
        else:
            binary = not isinstance(outfile, io.TextIOBase)

        if binary:
            # byte mode file object: write it directly.
            outfile.write(contents)
        else:
            # string-mode file object: decode to string using provided encoding.
            outfile.write(contents.decode(encoding))

    # NOTE(review): the class docstring reads ``contents`` (also ``@cached``)
    # as a plain attribute, so the example below uses attribute access as
    # well -- confirm against the ``gems.cached`` implementation.
    @cached
    def linkable_uri(self):
        """
        Return a "linkable" URL.

        The returned URL will be relative to the API_SERVER root,
        so external consumers should prepend the API_SERVER information.

        Examples:

            >>> File.create({ 'uri': '/path/to/myfile.txt', 'name': 'myfile' }).linkable_uri
            '/api/files/<:uuid>/static'

        This method is provided to simplify embedding links to ESP files in extensions
        and third-party content. Note that the URL format is subject to change
        in the future.
        """
        return "/api/{}/{}/static".format(self.__api__, self.uuid)