Hi everyone,
I am initializing a context and adding data sources and assets in the following way:
from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    S3StoreBackendDefaults,
)

config_data_docs_sites = {
    "s3_site": {
        "class_name": "SiteBuilder",
        "store_backend": {
            "class_name": "TupleS3StoreBackend",
            "bucket": "great-expectations",
            "prefix": "data_docs",
            "boto3_options": BOTO3_OPTIONS,
        },
    },
}

data_context_config = DataContextConfig(
    store_backend_defaults=S3StoreBackendDefaults(default_bucket_name=GX_BUCKET_NAME),
    data_docs_sites=config_data_docs_sites,
)
context = BaseDataContext(project_config=data_context_config)
datasource_name = f"gx_temp_{latest_filename.replace('/', '_')}_{latest_version}"
bucket_name = TEMP_DESTINATION_BUCKET
datasource = context.sources.add_or_update_pandas_s3(
    name=datasource_name, bucket=bucket_name, boto3_options=BOTO3_OPTIONS
)
asset_names = []
for file_name_df in data_asset_filelist:
    file_name_df = file_name_df.replace('/', '_')
    # The four branches here were identical except for the dfN suffix, so they
    # are collapsed into one loop. The asset names are also collected so the
    # validation loop below has a defined asset_names list to iterate over.
    for suffix in ('df1', 'df2', 'df3', 'df4'):
        if file_name_df.endswith(suffix):
            asset_name = f"{latest_filename}_{latest_version}_{suffix}_{file_name_df}"
            s3_prefix = f"{latest_filename}/{latest_version}"
            batching_regex = f"{file_name_df.split('_')[2]}_{file_name_df.split('_')[3]}"
            datasource.add_parquet_asset(
                name=asset_name, s3_prefix=s3_prefix, batching_regex=batching_regex
            )
            asset_names.append(asset_name)
            break
validations_df1 = []
validations_df2 = []
validations_df3 = []
validations_df4 = []
for asset_name in asset_names:
    data_asset = datasource.get_asset(asset_name)
    my_batch_request = data_asset.build_batch_request()
    if asset_name.endswith('df1'):
        expectation = 'Exp_Abteilung'
        validations_df1.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
    elif asset_name.endswith('df2'):
        expectation = 'Exp_Person'
        validations_df2.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
    elif asset_name.endswith('df3'):
        expectation = 'Exp_Ausruestung'
        validations_df3.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
    else:
        expectation = 'Exp_Combined'
        validations_df4.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
checkpoint1 = context.add_or_update_checkpoint(
    name='checkpoint_df1',
    validations=validations_df1,
    run_name_template=f'{latest_filename}_{latest_version}_df1',
    action_list=[
        {
            'name': 'store_validation_result',
            'action': {'class_name': 'StoreValidationResultAction'},
        },
        {
            'name': 'store_evaluation_params',
            'action': {'class_name': 'StoreEvaluationParametersAction'},
        },
        {
            'name': 'update_data_docs',
            'action': {'class_name': 'UpdateDataDocsAction'},
        },
    ],
)
results1 = checkpoint1.run()
GX updates the data docs after every single validation in validations_df1 has run, which leads to very slow execution times. I am looking for a way to first run all the validations in the list and only then update the data docs once, so that the new validation results appear together with the old ones, without rebuilding the entire data docs site from scratch. I need all validations visible in data docs, but I cannot afford the long runtimes.
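To make the goal concrete, here is a minimal sketch of the kind of approach I have in mind: drop the UpdateDataDocsAction from the checkpoint's action_list so nothing is rebuilt per validation, and afterwards call context.build_data_docs() once, passing resource_identifiers so that only the new validation results are rendered into the existing site. I am not sure this is the intended pattern (in particular whether resource_identifiers avoids a full rebuild the way I hope), so corrections are welcome:

# Sketch only: same checkpoint as above, but without the UpdateDataDocsAction,
# so data docs are not touched after each individual validation.
checkpoint1 = context.add_or_update_checkpoint(
    name='checkpoint_df1',
    validations=validations_df1,
    run_name_template=f'{latest_filename}_{latest_version}_df1',
    action_list=[
        {
            'name': 'store_validation_result',
            'action': {'class_name': 'StoreValidationResultAction'},
        },
        {
            'name': 'store_evaluation_params',
            'action': {'class_name': 'StoreEvaluationParametersAction'},
        },
        # 'update_data_docs' intentionally omitted here
    ],
)
results1 = checkpoint1.run()

# After all checkpoints have run, render only the new validation results into
# the existing data docs site (assumption: passing resource_identifiers limits
# the build to those results instead of regenerating every page).
context.build_data_docs(
    resource_identifiers=results1.list_validation_result_identifiers()
)

Would this work, or is there a better supported way to batch the data docs update?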
Thank you for your help.