Added support for uploads to S3 by using the boto3 client.

2018-03-09 16:48:34 +01:00 · 2018-03-09 16:48:34 +01:00 · ea832cd666
commit ea832cd666
parent 4a0673918d
4 changed files with 207 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -9,6 +9,23 @@ Blender Asset Tracer, a.k.a. BAT🦇, is the replacement of
 Development is driven by choices explained in [T54125](https://developer.blender.org/T54125).
 ## Uploading to S3-compatible storage
 BAT Pack supports uploading to S3-compatible storage. This requires a credentials file in
 `~/.aws/credentials`. Replace the all-capital words to suit your situation.
    [ENDPOINT]
    aws_access_key_id = YOUR_ACCESS_KEY_ID
    aws_secret_access_key = YOUR_SECRET
 You can then send a BAT Pack to the storage using a target `s3:/endpoint/bucketname/path-in-bucket`, for example:
    bat pack my_blendfile.blend s3:/storage.qarnot.com/jobs/awesome_work
 This will upload the blend file and its dependencies to `awesome_work/my_blendfile.blend` in
 the `jobs` bucket.
 ## Paths
 There are two object types used to represent file paths. Those are strictly separated.
--- a/blender_asset_tracer/cli/pack.py
+++ b/blender_asset_tracer/cli/pack.py
@ -30,9 +30,9 @@ def add_parser(subparsers):
 def cli_pack(args):
    bpath, ppath, tpath = paths_from_cli(args)
    packer = pack.Packer(bpath, ppath, tpath, args.noop)
    packer.strategise()
    packer = create_packer(args, bpath, ppath, tpath)
    packer.strategise()
    try:
        packer.execute()
    except blender_asset_tracer.pack.transfer.FileTransferError as ex:
@ -41,6 +41,25 @@ def cli_pack(args):
        raise SystemExit(1)
 def create_packer(args, bpath: pathlib.Path, ppath: pathlib.Path,
                  tpath: pathlib.Path) -> pack.Packer:
    if str(tpath).startswith('s3:/'):
        if args.noop:
            raise ValueError('S3 uploader does not support no-op.')
        from blender_asset_tracer.pack import s3
        # Split the target path into 's3:/', hostname, and actual target path
        parts = tpath.parts
        endpoint = parts[1]
        tpath = pathlib.Path(*tpath.parts[2:])
        log.info('Uploading to S3-compatible storage %s at %s', endpoint, tpath)
        return s3.S3Packer(bpath, ppath, tpath, endpoint=endpoint)
    return pack.Packer(bpath, ppath, tpath, args.noop)
 def paths_from_cli(args) -> typing.Tuple[pathlib.Path, pathlib.Path, pathlib.Path]:
    """Return paths to blendfile, project, and pack target.
--- a/blender_asset_tracer/pack/s3.py
+++ b/blender_asset_tracer/pack/s3.py
@ -0,0 +1,166 @@
 """Amazon S3-compatible uploader."""
 import typing
 import hashlib
 import logging
 import pathlib
 import threading
 from . import Packer, transfer
 log = logging.getLogger(__name__)
 def compute_md5(filepath: pathlib.Path) -> str:
    hasher = hashlib.md5()
    with filepath.open('rb') as infile:
        while True:
            block = infile.read(10240)
            if not block:
                break
            hasher.update(block)
    md5 = hasher.hexdigest()
    return md5
 class S3Packer(Packer):
    """Creates BAT Packs on S3-compatible storage."""
    def __init__(self, *args, endpoint, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        import boto3
        # Create a session so that credentials can be read from the [endpoint]
        # section in ~/.aws/credentials.
        # See https://boto3.readthedocs.io/en/latest/guide/configuration.html#guide-configuration
        self.session = boto3.Session(profile_name=endpoint)
        self.client = self.session.client('s3', endpoint_url='https://%s' % endpoint)
    def set_credentials(self,
                        endpoint: str,
                        access_key_id: str,
                        secret_access_key: str):
        """Set S3 credentials."""
        import boto3
        self.client = boto3.client('s3',
                                   endpoint_url=endpoint,
                                   aws_access_key_id=access_key_id,
                                   aws_secret_access_key=secret_access_key)
    def _create_file_transferer(self) -> transfer.FileTransferer:
        return S3Transferrer(self.client)
 class S3Transferrer(threading.Thread, transfer.FileTransferer):
    """Copies or moves files in source directory order."""
    class AbortUpload(Exception):
        """Raised from the upload callback to abort an upload."""
    def __init__(self, botoclient) -> None:
        # Stupid Thread.__init__ doesn't call super().__init__(),
        # so it doesn't get chained to transfer.FileTransferer.__init__().
        # However, I want to have Thread as first subclass so that its
        # start() and join() methods Just Work™.
        threading.Thread.__init__(self)
        transfer.FileTransferer.__init__(self)
        self.client = botoclient
    def run(self) -> None:
        files_transferred = 0
        files_skipped = 0
        for src, dst, act in self.iter_queue():
            try:
                did_upload = self.upload_file(src, dst)
                files_transferred += did_upload
                files_skipped += not did_upload
                if act == transfer.Action.MOVE:
                    self.delete_file(src)
            except Exception:
                # We have to catch exceptions in a broad way, as this is running in
                # a separate thread, and exceptions won't otherwise be seen.
                log.exception('Error transferring %s to %s', src, dst)
                # Put the files to copy back into the queue, and abort. This allows
                # the main thread to inspect the queue and see which files were not
                # copied. The one we just failed (due to this exception) should also
                # be reported there.
                self.queue.put((src, dst, act))
                return
        if files_transferred:
            log.info('Transferred %d files', files_transferred)
        if files_skipped:
            log.info('Skipped %d files', files_skipped)
    def upload_file(self, src: pathlib.Path, dst: pathlib.Path) -> bool:
        """Upload a file to an S3 bucket.
        The first part of 'dst' is used as the bucket name, the remained as the
        path inside the bucket.
        :returns: True if the file was uploaded, False if it was skipped.
        """
        bucket = dst.parts[0]
        dst_path = pathlib.Path(*dst.parts[1:])
        md5 = compute_md5(src)
        key = str(dst_path)
        existing_md5, existing_size = self.get_metadata(bucket, key)
        if md5 == existing_md5 and src.stat().st_size == existing_size:
            log.debug('skipping %s, it already exists on the server with MD5 %s',
                      src, existing_md5)
            return False
        # TODO(Sybren): when queueing files inspect their size, and have a
        # callback that reports the total progress.
        log.info('Uploading %s', src)
        try:
            self.client.upload_file(str(src),
                                    Bucket=bucket,
                                    Key=key,
                                    Callback=self._upload_callback,
                                    ExtraArgs={'Metadata': {'md5': md5}})
        except self.AbortUpload:
            return False
        return True
    def _upload_callback(self, bytes_uploaded: int):
        if self.abort.is_set():
            log.warning('Interrupting ongoing upload')
            raise self.AbortUpload('interrupting ongoing upload')
    def delete_file(self, path: pathlib.Path):
        """Deletes a file, only logging a warning if deletion fails."""
        log.debug('Deleting %s, file has been uploaded', path)
        try:
            path.unlink()
        except IOError as ex:
            log.warning('Unable to delete %s: %s', path, ex)
    def get_metadata(self, bucket: str, key: str) -> typing.Tuple[str, int]:
        """Get MD5 sum and size on S3.
        :returns: the MD5 hexadecimal hash and the file size in bytes.
            If the file does not exist or has no known MD5 sum,
            returns ('', -1)
        """
        import botocore.exceptions
        try:
            info = self.client.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as ex:
            error_code = ex.response.get('Error').get('Code', 'Unknown')
            # error_code already is a string, but this makes the code forward
            # compatible with a time where they use integer codes.
            if str(error_code) == '404':
                return '', -1
            raise ValueError('error response:' % ex.response) from None
        try:
            return info['Metadata']['md5'], info['ContentLength']
        except KeyError:
            return '', -1
--- a/setup.py
+++ b/setup.py
@ -39,5 +39,8 @@ setup(
            'bat = blender_asset_tracer.cli:cli_main',
        ],
    },
    extras_require={
        's3': ['boto3'],
    },
    zip_safe=True,
 )