 from google.api_core import exceptions
 from google.cloud.storage import Client
 from google.cloud.storage import Blob
+from google.cloud.storage.blob import _get_host_name
+from google.cloud.storage.constants import _DEFAULT_TIMEOUT
+
+from google.resumable_media.requests.upload import XMLMPUContainer
+from google.resumable_media.requests.upload import XMLMPUPart
+

 warnings.warn(
     "The module `transfer_manager` is a preview feature. Functionality and API "

 TM_DEFAULT_CHUNK_SIZE = 32 * 1024 * 1024
 DEFAULT_MAX_WORKERS = 8
-
+METADATA_HEADER_TRANSLATION = {
+    "cacheControl": "Cache-Control",
+    "contentDisposition": "Content-Disposition",
+    "contentEncoding": "Content-Encoding",
+    "contentLanguage": "Content-Language",
+    "customTime": "x-goog-custom-time",
+    "storageClass": "x-goog-storage-class",
+}

 # Constants to be passed in as `worker_type`.
 PROCESS = "process"
@@ -198,7 +211,7 @@ def upload_many(
             futures.append(
                 executor.submit(
                     _call_method_on_maybe_pickled_blob,
-                    _pickle_blob(blob) if needs_pickling else blob,
+                    _pickle_client(blob) if needs_pickling else blob,
                     "upload_from_filename"
                     if isinstance(path_or_file, str)
                     else "upload_from_file",
@@ -343,7 +356,7 @@ def download_many(
             futures.append(
                 executor.submit(
                     _call_method_on_maybe_pickled_blob,
-                    _pickle_blob(blob) if needs_pickling else blob,
+                    _pickle_client(blob) if needs_pickling else blob,
                     "download_to_filename"
                     if isinstance(path_or_file, str)
                     else "download_to_file",
@@ -733,7 +746,6 @@ def download_chunks_concurrently(
     Checksumming (md5 or crc32c) is not supported for chunked operations. Any
     `checksum` parameter passed in to download_kwargs will be ignored.

-    :type bucket: 'google.cloud.storage.bucket.Bucket'
     :param bucket:
         The bucket which contains the blobs to be downloaded

@@ -745,6 +757,12 @@ def download_chunks_concurrently(
     :param filename:
         The destination filename or path.

+    :type chunk_size: int
+    :param chunk_size:
+        The size in bytes of each chunk to send. The optimal chunk size for
+        maximum throughput may vary depending on the exact network environment
+        and size of the blob.
+
     :type download_kwargs: dict
     :param download_kwargs:
         A dictionary of keyword arguments to pass to the download method. Refer
@@ -809,7 +827,7 @@ def download_chunks_concurrently(

     pool_class, needs_pickling = _get_pool_class_and_requirements(worker_type)
     # Pickle the blob ahead of time (just once, not once per chunk) if needed.
-    maybe_pickled_blob = _pickle_blob(blob) if needs_pickling else blob
+    maybe_pickled_blob = _pickle_client(blob) if needs_pickling else blob

     futures = []
@@ -844,9 +862,249 @@ def download_chunks_concurrently(
     return None


+def upload_chunks_concurrently(
+    filename,
+    blob,
+    content_type=None,
+    chunk_size=TM_DEFAULT_CHUNK_SIZE,
+    deadline=None,
+    worker_type=PROCESS,
+    max_workers=DEFAULT_MAX_WORKERS,
+    *,
+    checksum="md5",
+    timeout=_DEFAULT_TIMEOUT,
+):
+    """Upload a single file in chunks, concurrently.
+
+    This function uses the XML MPU API to initialize an upload and upload a
+    file in chunks, concurrently with a worker pool.
+
+    The XML MPU API is significantly different from other uploads; please review
+    the documentation at https://cloud.google.com/storage/docs/multipart-uploads
+    before using this feature.
+
+    The library will attempt to cancel uploads that fail due to an exception.
+    If the upload fails in a way that precludes cancellation, such as a
+    hardware failure, process termination, or power outage, then the incomplete
+    upload may persist indefinitely. To mitigate this, set the
+    `AbortIncompleteMultipartUpload` with a nonzero `Age` in bucket lifecycle
+    rules, or refer to the XML API documentation linked above to learn more
+    about how to list and delete individual uploads.
+
+    Using this feature with multiple threads is unlikely to improve upload
+    performance under normal circumstances due to Python interpreter threading
+    behavior. The default is therefore to use processes instead of threads.
+
+    ACL information cannot be sent with this function and should be set
+    separately with :class:`ObjectACL` methods.
+
+    :type filename: str
+    :param filename:
+        The path to the file to upload. File-like objects are not supported.
+
+    :type blob: `google.cloud.storage.Blob`
+    :param blob:
+        The blob to which to upload.
+
+    :type content_type: str
+    :param content_type: (Optional) Type of content being uploaded.
+
+    :type chunk_size: int
+    :param chunk_size:
+        The size in bytes of each chunk to send. The optimal chunk size for
+        maximum throughput may vary depending on the exact network environment
+        and size of the blob. The remote API has restrictions on the minimum
+        and maximum size allowable, see: https://cloud.google.com/storage/quotas#requests
+
+    :type deadline: int
+    :param deadline:
+        The number of seconds to wait for all threads to resolve. If the
+        deadline is reached, all threads will be terminated regardless of their
+        progress and concurrent.futures.TimeoutError will be raised. This can be
+        left as the default of None (no deadline) for most use cases.
+
+    :type worker_type: str
+    :param worker_type:
+        The worker type to use; one of google.cloud.storage.transfer_manager.PROCESS
+        or google.cloud.storage.transfer_manager.THREAD.
+
+        Although the exact performance impact depends on the use case, in most
+        situations the PROCESS worker type will use more system resources (both
+        memory and CPU) and result in faster operations than THREAD workers.
+
+        Because the subprocesses of the PROCESS worker type can't access memory
+        from the main process, Client objects have to be serialized and then
+        recreated in each subprocess. The serialization of the Client object
+        for use in subprocesses is an approximation and may not capture every
+        detail of the Client object, especially if the Client was modified after
+        its initial creation or if `Client._http` was modified in any way.
+
+        THREAD worker types are observed to be relatively efficient for
+        operations with many small files, but not for operations with large
+        files. PROCESS workers are recommended for large file operations.
+
+    :type max_workers: int
+    :param max_workers:
+        The maximum number of workers to create to handle the workload.
+
+        With PROCESS workers, a larger number of workers will consume more
+        system resources (memory and CPU) at once.
+
+        How many workers is optimal depends heavily on the specific use case,
+        and the default is a conservative number that should work okay in most
+        cases without consuming excessive resources.
+
+    :type checksum: str
+    :param checksum:
+        (Optional) The checksum scheme to use: either 'md5', 'crc32c' or None.
+        Each individual part is checksummed. At present, the selected checksum
+        rule is only applied to parts and a separate checksum of the entire
+        resulting blob is not computed. Please compute and compare the checksum
+        of the file to the resulting blob separately if needed, using the
+        'crc32c' algorithm as per the XML MPU documentation.
+
+    :type timeout: float or tuple
+    :param timeout:
+        (Optional) The amount of time, in seconds, to wait
+        for the server response. See: :ref:`configuring_timeouts`
+
+    :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded.
+    """
+
+    bucket = blob.bucket
+    client = blob.client
+    transport = blob._get_transport(client)
+
+    hostname = _get_host_name(client._connection)
+    url = "{hostname}/{bucket}/{blob}".format(
+        hostname=hostname, bucket=bucket.name, blob=blob.name
+    )
+
+    base_headers, object_metadata, content_type = blob._get_upload_arguments(
+        client, content_type, filename=filename
+    )
+    headers = {**base_headers, **_headers_from_metadata(object_metadata)}
+
+    if blob.user_project is not None:
+        headers["x-goog-user-project"] = blob.user_project
+
+    # When a Customer Managed Encryption Key is used to encrypt Cloud Storage object
+    # at rest, object resource metadata will store the version of the Key Management
+    # Service cryptographic material. If a Blob instance with KMS Key metadata set is
+    # used to upload a new version of the object then the existing kmsKeyName version
+    # value can't be used in the upload request and the client instead ignores it.
+    if blob.kms_key_name is not None and "cryptoKeyVersions" not in blob.kms_key_name:
+        headers["x-goog-encryption-kms-key-name"] = blob.kms_key_name
+
+    container = XMLMPUContainer(url, filename, headers=headers)
+    container.initiate(transport=transport, content_type=content_type)
+    upload_id = container.upload_id
+
+    size = os.path.getsize(filename)
+    num_of_parts = -(size // -chunk_size)  # Ceiling division
+
+    pool_class, needs_pickling = _get_pool_class_and_requirements(worker_type)
+    # Pickle the client ahead of time (just once, not once per part) if needed.
+    maybe_pickled_client = _pickle_client(client) if needs_pickling else client
+
+    futures = []
+
+    with pool_class(max_workers=max_workers) as executor:
+
+        for part_number in range(1, num_of_parts + 1):
+            start = (part_number - 1) * chunk_size
+            end = min(part_number * chunk_size, size)
+
+            futures.append(
+                executor.submit(
+                    _upload_part,
+                    maybe_pickled_client,
+                    url,
+                    upload_id,
+                    filename,
+                    start=start,
+                    end=end,
+                    part_number=part_number,
+                    checksum=checksum,
+                    headers=headers,
+                )
+            )
+
+        concurrent.futures.wait(
+            futures, timeout=deadline, return_when=concurrent.futures.ALL_COMPLETED
+        )
+
+    try:
+        # Harvest results and raise exceptions.
+        for future in futures:
+            part_number, etag = future.result()
+            container.register_part(part_number, etag)
+
+        container.finalize(blob._get_transport(client))
+    except Exception:
+        container.cancel(blob._get_transport(client))
+        raise
+
+
+def _upload_part(
+    maybe_pickled_client,
+    url,
+    upload_id,
+    filename,
+    start,
+    end,
+    part_number,
+    checksum,
+    headers,
+):
+    """Helper function that runs inside a thread or subprocess to upload a part.
+
+    `maybe_pickled_client` is either a Client (for threads) or a specially
+    pickled Client (for processes) because the default pickling mangles Client
+    objects."""
+
+    if isinstance(maybe_pickled_client, Client):
+        client = maybe_pickled_client
+    else:
+        client = pickle.loads(maybe_pickled_client)
+    part = XMLMPUPart(
+        url,
+        upload_id,
+        filename,
+        start=start,
+        end=end,
+        part_number=part_number,
+        checksum=checksum,
+        headers=headers,
+    )
+    part.upload(client._http)
+    return (part_number, part.etag)
+
+
+def _headers_from_metadata(metadata):
+    """Helper function to translate object metadata into a header dictionary."""
+
+    headers = {}
+    # Handle standard writable metadata
+    for key, value in metadata.items():
+        if key in METADATA_HEADER_TRANSLATION:
+            headers[METADATA_HEADER_TRANSLATION[key]] = value
+    # Handle custom metadata
+    if "metadata" in metadata:
+        for key, value in metadata["metadata"].items():
+            headers["x-goog-meta-" + key] = value
+    return headers
+
+
 def _download_and_write_chunk_in_place(
     maybe_pickled_blob, filename, start, end, download_kwargs
 ):
+    """Helper function that runs inside a thread or subprocess.
+
+    `maybe_pickled_blob` is either a Blob (for threads) or a specially pickled
+    Blob (for processes) because the default pickling mangles Client objects
+    which are attached to Blobs."""
+
     if isinstance(maybe_pickled_blob, Blob):
         blob = maybe_pickled_blob
     else:
@@ -863,9 +1121,9 @@ def _call_method_on_maybe_pickled_blob(
 ):
     """Helper function that runs inside a thread or subprocess.

-    `maybe_pickled_blob` is either a blob (for threads) or a specially pickled
-    blob (for processes) because the default pickling mangles clients which are
-    attached to blobs."""
+    `maybe_pickled_blob` is either a Blob (for threads) or a specially pickled
+    Blob (for processes) because the default pickling mangles Client objects
+    which are attached to Blobs."""

     if isinstance(maybe_pickled_blob, Blob):
         blob = maybe_pickled_blob
@@ -894,8 +1152,8 @@ def _reduce_client(cl):
     )


-def _pickle_blob(blob):
-    """Pickle a Blob (and its Bucket and Client) and return a bytestring."""
+def _pickle_client(obj):
+    """Pickle a Client or an object that owns a Client (like a Blob)"""

     # We need a custom pickler to process Client objects, which are attached to
     # Buckets (and therefore to Blobs in turn). Unfortunately, the Python
@@ -907,7 +1165,7 @@ def _pickle_blob(blob):
     p = pickle.Pickler(f)
     p.dispatch_table = copyreg.dispatch_table.copy()
     p.dispatch_table[Client] = _reduce_client
-    p.dump(blob)
+    p.dump(obj)
     return f.getvalue()
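For reference, a minimal usage sketch of the upload_chunks_concurrently function added above (not part of the diff): the bucket name, object name, and local path are placeholders, and the call assumes a google-cloud-storage build that includes this change.

from google.cloud import storage
from google.cloud.storage import transfer_manager

client = storage.Client()
bucket = client.bucket("my-bucket")         # placeholder bucket name
blob = bucket.blob("backups/archive.tar")   # placeholder object name

# Upload the local file as an XML multipart upload: 32 MiB parts handled by
# the default process-based worker pool, then finalized into a single object.
transfer_manager.upload_chunks_concurrently(
    "/tmp/archive.tar",                     # placeholder local path
    blob,
    chunk_size=32 * 1024 * 1024,
    max_workers=8,
)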