I tried to follow it, but I'm stuck on the validation definition: I encounter an error while retrieving the batch definition.
import great_expectations as gx
import pandas as pd
from scripts.base import Base
from great_expectations import expectations as gxe
# Initialize the project helper used to query the tables.
base = Base()

# Positional index of the source row to check.
index_value = 0

# Query source and destination tables.
# NOTE(review): the empty-string arguments are reproduced exactly as posted;
# the forum formatting may have swallowed "*" wildcards here -- confirm the
# intended values against Base.query_tables.
source = base.query_tables("Raw_Data", "", "")
destination = base.query_tables("", "DEV-SQL02", "*")

# Extract the fields of the selected source row that are used for matching.
# Names are upper-cased and only the first four characters of the address are
# kept, because the destination lookup below compares against those forms.
taxsource = source["TaxIdentificationNumber"].iloc[index_value]
firstgroup = str(source["GroupName"].iloc[index_value]).upper()
firstlast = str(source["ProviderLastName"].iloc[index_value]).upper()
firstaddress = str(source["ProviderAddress1"].iloc[index_value]).upper()[:4]

# Debug output
print(firstaddress)
print(type(firstaddress))
print(taxsource)

# Select the destination rows that match the source row on tax id, group
# name, last name and address prefix.
tax = destination["TaxID"]
totid = destination[
    (tax == taxsource)
    & (destination["Hospital_N"] == firstgroup)
    & (destination["Last"] == firstlast)
    & (destination["Address"].str.startswith(firstaddress))
]

# Fail with a readable message instead of pandas' generic out-of-bounds
# error when the filter produced too few rows.
if index_value >= len(totid):
    raise IndexError(
        f"No destination row at position {index_value}: "
        f"{len(totid)} row(s) matched TaxID={taxsource!r}, "
        f"Hospital_N={firstgroup!r}, Last={firstlast!r}, "
        f"Address prefix={firstaddress!r}"
    )

firstID = totid["Tothpdpyid"].iloc[index_value]
firsttax = totid["TaxID"].iloc[index_value]
print(firsttax)

# One-row DataFrame holding the two tax ids side by side so they can be
# compared column-against-column.
merge_data = pd.DataFrame(
    {"TaxIdentificationNumber": [taxsource], "TaxID": [firsttax]}
)
print(merge_data)
# Set up the Great Expectations context.
context = gx.get_context()
print(type(context).__name__)

# Register a pandas data source, a dataframe asset and a whole-dataframe
# batch definition. The three names used here ("totid", "pd dataframe asset",
# "batch definition") are the ones to pass to context.data_sources.get(...),
# .get_asset(...) and .get_batch_definition(...) when retrieving them later.
data_source = context.data_sources.add_pandas("totid")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")

# The DataFrame is passed in at runtime through batch_parameters.
batch = batch_definition.get_batch(batch_parameters={"dataframe": merge_data})
# Create the Expectation Suite and register it with the Data Context.
suite_name = "my_expectation_suite"
suite = gx.ExpectationSuite(name=suite_name)
suite = context.suites.add(suite)

# Fetch the suite back from the context by name.
existing_suite_name = (
    "my_expectation_suite"  # replace this with the name of your Expectation Suite
)
suite = context.suites.get(name=existing_suite_name)

# Define the expectation: both tax-id columns must hold equal values.
great_expectation = gx.expectations.ExpectColumnPairValuesToBeEqual(
    column_A="TaxIdentificationNumber",
    column_B="TaxID",
)
suite.add_expectation(great_expectation)
great_expectation.save()

# Validate the expectation directly against the batch.
validation_results = batch.validate(great_expectation)
print(validation_results)
# Create the Validation Definition.
expectation_suite_name = "my_expectation_suite"
expectation_suite = context.suites.get(name=expectation_suite_name)

# Retrieve the batch definition.
# These three names are not arbitrary placeholders: they must be exactly the
# names used when the objects were registered earlier in this script, i.e. the
# arguments passed to context.data_sources.add_pandas(...),
# data_source.add_dataframe_asset(name=...) and
# data_asset.add_batch_definition_whole_dataframe(...).
data_source_name = "totid"
data_asset_name = "pd dataframe asset"
batch_definition_name = "batch definition"
batch_definition = (
    context.data_sources.get(data_source_name)
    .get_asset(data_asset_name)
    .get_batch_definition(batch_definition_name)
)

# Bind the batch definition to the Expectation Suite.
definition_name = "my_validation_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=expectation_suite, name=definition_name
)

# Save the Validation Definition to your Data Context.
validation_definition = context.validation_definitions.add(validation_definition)

# Retrieve it by name and run it.
validation_definition_name = "my_validation_definition"
validation_definition = context.validation_definitions.get(validation_definition_name)

# A whole-dataframe batch definition does not store data, so the DataFrame
# has to be supplied at run time.
validation_results = validation_definition.run(
    batch_parameters={"dataframe": merge_data}
)
print(validation_results)
# context.build_data_docs()
# context.build_data_docs()
Do you know where the values below should come from?
data_source_name = "my_data_source"
data_asset_name = "my_data_asset"
batch_definition_name = "my_batch_definition"