|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import shutil |
|
import os |
|
from datetime import timedelta |
|
|
|
import pyarrow as pa |
|
import pyarrow.dataset as ds |
|
import pyarrow.parquet.encryption as pe |
|
from pyarrow.tests.parquet.encryption import InMemoryKmsClient |
|
|
|
# A sample to demonstrate parquet dataset encryption and decryption.
# (Kept as a comment: a string literal placed after the imports is not a
# module docstring.)

# Small in-memory table used as the data to encrypt.
table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
                  'n_legs': [2, 2, 4, 4, 5, 100],
                  'animal': ["Flamingo", "Parrot", "Dog", "Horse",
                             "Brittle stars", "Centipede"]})

# Wrap the table in a dataset so it can be passed to ds.write_dataset().
dataset = ds.dataset(table)

# Hard-coded demo keys and the names they are registered under.
# NOTE(review): fine for a self-contained sample; real keys must not be
# embedded in source code.
FOOTER_KEY = b"0123456789112345"
FOOTER_KEY_NAME = "footer_key"
COL_KEY = b"1234567890123450"
COL_KEY_NAME = "col_key"

encryption_config = pe.EncryptionConfiguration(
    footer_key=FOOTER_KEY_NAME,
    plaintext_footer=False,
    # Use COL_KEY_NAME to encrypt the `n_legs` and `animal` columns.
    column_keys={
        COL_KEY_NAME: ["n_legs", "animal"],
    },
    encryption_algorithm="AES_GCM_V1",
    # cache_lifetime is given as a timedelta.
    cache_lifetime=timedelta(minutes=5.0),
    data_key_length_bits=256)

# The key material is handed to the KMS client through custom_kms_conf,
# keyed by key name; values are the keys decoded to text.
kms_connection_config = pe.KmsConnectionConfig(
    custom_kms_conf={
        FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
        COL_KEY_NAME: COL_KEY.decode("UTF-8"),
    }
)

# cache_lifetime is documented as a timedelta (not a bare number of
# seconds); use the same five-minute lifetime as the encryption side.
decryption_config = pe.DecryptionConfiguration(
    cache_lifetime=timedelta(minutes=5))
|
|
|
|
|
def kms_factory(kms_connection_configuration):
    """Build the KMS client for the given connection configuration.

    Passed to ``pe.CryptoFactory`` as its client factory; returns an
    ``InMemoryKmsClient`` constructed from ``kms_connection_configuration``.
    """
    client = InMemoryKmsClient(kms_connection_configuration)
    return client
|
|
|
|
|
# Pair the crypto factory and KMS connection settings with the encryption
# and decryption configurations built earlier in this script.
crypto_factory = pe.CryptoFactory(kms_factory)
parquet_encryption_cfg = ds.ParquetEncryptionConfig(
    crypto_factory,
    kms_connection_config,
    encryption_config,
)
parquet_decryption_cfg = ds.ParquetDecryptionConfig(
    crypto_factory,
    kms_connection_config,
    decryption_config,
)

# Set the decryption config on the parquet fragment scan options and make
# those the default scan options of the file format.
pq_scan_opts = ds.ParquetFragmentScanOptions()
pq_scan_opts.parquet_decryption_config = parquet_decryption_cfg
pformat = ds.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts)

# Remove output left over from a previous run before writing.
if os.path.exists('sample_dataset'):
    shutil.rmtree('sample_dataset')

# Write options carry the encryption side of the configuration.
write_options = pformat.make_write_options(
    encryption_config=parquet_encryption_cfg)

# Write the dataset, partitioned by `year`.
ds.write_dataset(
    data=dataset,
    base_dir="sample_dataset",
    partitioning=['year'],
    format=pformat,
    file_options=write_options,
)

# Read the dataset back through the same format object, which carries the
# decryption settings, and print its contents.
dataset = ds.dataset('sample_dataset', format=pformat)

print(dataset.to_table())
|
|