Appending validations to data docs

Hi everyone,
I am initializing a context and adding data sources and assets as follows:

from great_expectations.data_context import BaseDataContext
from great_expectations.data_context.types.base import (
    DataContextConfig,
    S3StoreBackendDefaults,
)

# BOTO3_OPTIONS and GX_BUCKET_NAME are defined elsewhere in the script.
config_data_docs_sites = {
    "s3_site": {
        "class_name": "SiteBuilder",
        "store_backend": {
            "class_name": "TupleS3StoreBackend",
            "bucket": "great-expectations",
            "prefix": "data_docs",
            "boto3_options": BOTO3_OPTIONS,
        },
    },
}
data_context_config = DataContextConfig(
    store_backend_defaults=S3StoreBackendDefaults(default_bucket_name=GX_BUCKET_NAME),
    data_docs_sites=config_data_docs_sites,
)
context = BaseDataContext(project_config=data_context_config)

 datasource_name = f"gx_temp_{latest_filename.replace('/', '_')}_{latest_version}"
    bucket_name = TEMP_DESTINATION_BUCKET
    datasource = context.sources.add_or_update_pandas_s3(
        name=datasource_name, bucket=bucket_name, boto3_options=BOTO3_OPTIONS
    )

for file_name_df in dataasset_filelist:
    file_name_df = file_name_df.replace('/', '_')
    # Register one parquet asset per file; the dfN suffix picks the asset name.
    for suffix in ('df1', 'df2', 'df3', 'df4'):
        if file_name_df.endswith(suffix):
            asset_name = f"{latest_filename}_{latest_version}_{suffix}_{file_name_df}"
            s3_prefix = f"{latest_filename}/{latest_version}"
            parts = file_name_df.split("_")
            batching_regex = parts[2] + "_" + parts[3]
            datasource.add_parquet_asset(
                name=asset_name, s3_prefix=s3_prefix, batching_regex=batching_regex
            )
            break

validations_df1 = []
validations_df2 = []
validations_df3 = []
validations_df4 = []
for asset_name in asset_names:  # names of the assets registered above
    data_asset = datasource.get_asset(asset_name)
    my_batch_request = data_asset.build_batch_request()

    if asset_name.endswith('df1'):
        expectation = 'Exp_Abteilung'
        validations_df1.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
    elif asset_name.endswith('df2'):
        expectation = 'Exp_Person'
        validations_df2.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
    elif asset_name.endswith('df3'):
        expectation = 'Exp_Ausruestung'
        validations_df3.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )
    else:
        expectation = 'Exp_Combined'
        validations_df4.append(
            {"batch_request": my_batch_request, "expectation_suite_name": expectation}
        )

checkpoint1 = context.add_or_update_checkpoint(
    name='checkpoint_df1',
    validations=validations_df1,
    run_name_template=f'{latest_filename}_{latest_version}_df1',
    action_list=[
        {
            'name': 'store_validation_result',
            'action': {'class_name': 'StoreValidationResultAction'}
        },
        {
            'name': 'store_evaluation_params',
            'action': {'class_name': 'StoreEvaluationParametersAction'}
        },
        {
            'name': 'update_data_docs',
            'action': {'class_name': 'UpdateDataDocsAction'}
        }
    ]
)

results1 = checkpoint1.run()

GX updates the data docs after each validation in the list validations_df1 has been validated, which leads to very slow execution times. I am looking for a way to first run all the validations in the list and only then update the data docs to include the new validations together with the old ones, without rebuilding the entire data docs from scratch. I need all validations visible in data docs but cannot afford long runtimes.
Thank you for your help.

Remove the third action (update_data_docs) from your checkpoint's action_list, and only at the end of your script/pipeline run the Python command context.build_data_docs(), or on the CLI: great_expectations docs build.
Note: @Lauri Huhta originally posted this reply in Slack. It might not have transferred perfectly.
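
For illustration, here is a minimal sketch of that suggestion applied to the snippet above: the checkpoint keeps only the two store actions, and the docs are built once after all checkpoint runs (names are reused from the original code):

# Checkpoint without the update_data_docs action, so a run no longer
# touches the data docs site.
checkpoint1 = context.add_or_update_checkpoint(
    name='checkpoint_df1',
    validations=validations_df1,
    run_name_template=f'{latest_filename}_{latest_version}_df1',
    action_list=[
        {
            'name': 'store_validation_result',
            'action': {'class_name': 'StoreValidationResultAction'}
        },
        {
            'name': 'store_evaluation_params',
            'action': {'class_name': 'StoreEvaluationParametersAction'}
        },
    ],
)
results1 = checkpoint1.run()

# Build data docs once at the very end, after every checkpoint has run.
context.build_data_docs()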

context.build_data_docs()

this rebuilds the data docs completely. I need something that appends the new validations to the existing data docs; fully rebuilding the docs will get slow as more validations pile up.

Hi erman, I’m not sure if this is compatible with your use case, but have you looked into using UpdateDataDocsAction and specifying a list of sites via the site_names parameter?
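
For reference, a sketch of what that entry would look like in the action_list above ('s3_site' is the site name from the config at the top of this thread):

{
    'name': 'update_data_docs',
    'action': {
        'class_name': 'UpdateDataDocsAction',
        'site_names': ['s3_site'],  # only update this data docs site
    },
}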

Hi nevintan,
yes, I tried it, but the behavior is the same as when I don’t specify a site name.