Skip to main content
POST /api/v1/datadiffs
Create a data diff
curl --request POST \
  --url https://app.datafold.com/api/v1/datadiffs \
  --header 'Authorization: <api-key>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "data_source1_id": 123,
  "data_source2_id": 123,
  "pk_columns": [
    "<string>"
  ],
  "archived": false,
  "bisection_factor": 123,
  "bisection_threshold": 123,
  "column_mapping": [
    [
      "<string>",
      "<string>"
    ]
  ],
  "columns_to_compare": [
    "<string>"
  ],
  "compare_duplicates": true,
  "data_source1_session_parameters": {},
  "data_source2_session_parameters": {},
  "datetime_tolerance": 123,
  "diff_tolerance": 123,
  "diff_tolerances_per_column": [
    {
      "column_name": "<string>",
      "tolerance_mode": "absolute",
      "tolerance_value": 123
    }
  ],
  "download_limit": 123,
  "exclude_columns": [
    "<string>"
  ],
  "file1": "<string>",
  "file1_options": {
    "delimiter": "<string>",
    "file_type": "csv",
    "skip_head_rows": 123,
    "skip_tail_rows": 123
  },
  "file2": "<string>",
  "file2_options": {
    "delimiter": "<string>",
    "file_type": "csv",
    "skip_head_rows": 123,
    "skip_tail_rows": 123
  },
  "filter1": "<string>",
  "filter2": "<string>",
  "include_columns": [
    "<string>"
  ],
  "materialization_destination_id": 123,
  "materialize_dataset1": true,
  "materialize_dataset2": true,
  "materialize_without_sampling": false,
  "per_column_diff_limit": 123,
  "purged": false,
  "query1": "<string>",
  "query2": "<string>",
  "run_profiles": true,
  "sampling_confidence": 123,
  "sampling_max_rows": 123,
  "sampling_ratio": 123,
  "sampling_threshold": 123,
  "sampling_tolerance": 123,
  "table1": [
    "<string>"
  ],
  "table2": [
    "<string>"
  ],
  "table_modifiers": [
    "case_insensitive_strings"
  ],
  "tags": [
    "<string>"
  ],
  "time_aggregate": "minute",
  "time_column": "<string>",
  "time_interval_end": "2023-11-07T05:31:56Z",
  "time_interval_start": "2023-11-07T05:31:56Z",
  "time_travel_point1": 123,
  "time_travel_point2": 123,
  "tolerance_mode": "absolute"
}
'
{
  "data_source1_id": 123,
  "data_source2_id": 123,
  "kind": "in_db",
  "pk_columns": [
    "<string>"
  ],
  "affected_columns": [
    "<string>"
  ],
  "algorithm": "join",
  "archived": false,
  "bisection_factor": 123,
  "bisection_threshold": 123,
  "ci_base_branch": "<string>",
  "ci_pr_branch": "<string>",
  "ci_pr_num": 123,
  "ci_pr_sha": "<string>",
  "ci_pr_url": "<string>",
  "ci_pr_user_display_name": "<string>",
  "ci_pr_user_email": "<string>",
  "ci_pr_user_id": "<string>",
  "ci_pr_username": "<string>",
  "ci_run_id": 123,
  "ci_sha_url": "<string>",
  "column_mapping": [
    [
      "<string>",
      "<string>"
    ]
  ],
  "columns_to_compare": [
    "<string>"
  ],
  "compare_duplicates": true,
  "created_at": "2023-11-07T05:31:56Z",
  "data_app_metadata": {
    "data_app_id": 123,
    "data_app_model_type": "<string>",
    "meta_data": {},
    "data_app_model1_id": "<string>",
    "data_app_model1_name": "<string>",
    "data_app_model2_id": "<string>",
    "data_app_model2_name": "<string>"
  },
  "data_app_type": "<string>",
  "data_source1_session_parameters": {},
  "data_source2_session_parameters": {},
  "datetime_tolerance": 123,
  "diff_stats": {
    "version": "<string>",
    "diff_duplicate_pks": 123,
    "diff_null_pks": 123,
    "diff_pks": 123,
    "diff_rows": 123,
    "diff_rows_count": 123,
    "diff_rows_number": 123,
    "diff_schema": 123,
    "diff_values": 123,
    "errors": 123,
    "match_ratio": 123,
    "rows_added": 123,
    "rows_removed": 123,
    "sampled": true,
    "table_a_row_count": 123,
    "table_b_row_count": 123
  },
  "diff_tolerance": 123,
  "diff_tolerances_per_column": [
    {
      "column_name": "<string>",
      "tolerance_mode": "absolute",
      "tolerance_value": 123
    }
  ],
  "done": false,
  "download_limit": 123,
  "exclude_columns": [
    "<string>"
  ],
  "execute_as_user": true,
  "file1": "<string>",
  "file1_options": {
    "delimiter": "<string>",
    "file_type": "csv",
    "skip_head_rows": 123,
    "skip_tail_rows": 123
  },
  "file2": "<string>",
  "file2_options": {
    "delimiter": "<string>",
    "file_type": "csv",
    "skip_head_rows": 123,
    "skip_tail_rows": 123
  },
  "filter1": "<string>",
  "filter2": "<string>",
  "finished_at": "2023-11-07T05:31:56Z",
  "id": 123,
  "include_columns": [
    "<string>"
  ],
  "materialization_destination_id": 123,
  "materialize_dataset1": true,
  "materialize_dataset2": true,
  "materialize_without_sampling": false,
  "monitor_error": {
    "error_type": "<string>",
    "error_value": "<string>"
  },
  "monitor_id": 123,
  "monitor_state": "ok",
  "per_column_diff_limit": 123,
  "purged": false,
  "query1": "<string>",
  "query2": "<string>",
  "result": "error",
  "result_revisions": {},
  "result_statuses": {},
  "run_profiles": true,
  "runtime": 123,
  "sampling_confidence": 123,
  "sampling_max_rows": 123,
  "sampling_ratio": 123,
  "sampling_threshold": 123,
  "sampling_tolerance": 123,
  "share_token": "<string>",
  "share_url": "<string>",
  "source": "interactive",
  "status": "needs_confirmation",
  "table1": [
    "<string>"
  ],
  "table2": [
    "<string>"
  ],
  "table_modifiers": [
    "case_insensitive_strings"
  ],
  "tags": [
    "<string>"
  ],
  "temp_schema_override": [
    "<string>"
  ],
  "time_aggregate": "minute",
  "time_column": "<string>",
  "time_interval_end": "2023-11-07T05:31:56Z",
  "time_interval_start": "2023-11-07T05:31:56Z",
  "time_travel_point1": 123,
  "time_travel_point2": 123,
  "tolerance_mode": "absolute",
  "updated_at": "2023-11-07T05:31:56Z",
  "user_id": 123
}

Authorizations

Authorization
string
header
required

Use the 'Authorization' header with the format 'Key <api-key>'

Body

application/json
data_source1_id
integer
required

ID of the first data source (Dataset A).

data_source2_id
integer
required

ID of the second data source (Dataset B). Can be the same as data_source1_id.

pk_columns
string[]
required

Column names that uniquely identify rows, e.g. ['id'] or ['tenant_id', 'order_id']. Must match actual column names in both datasets.

archived
boolean
default:false
bisection_factor
integer | null
bisection_threshold
integer | null
column_mapping
tuple[] | null

Map columns with different names between datasets. List of [column_in_A, column_in_B] pairs.

columns_to_compare
string[] | null

Columns to compare between datasets. If set, only these columns are diffed (primary key columns are always included). Column names must match the dataset schema.

compare_duplicates
boolean | null

Compare rows with duplicate primary keys. Defaults to true.

data_source1_session_parameters
Data Source1 Session Parameters · object

Snowflake session parameters for Dataset A, e.g. {"QUERY_TAG": "datadiff", "WAREHOUSE": "COMPUTE_WH"}.

data_source2_session_parameters
Data Source2 Session Parameters · object

Snowflake session parameters for Dataset B.

datetime_tolerance
integer | null

Datetime precision for comparison. 0=seconds, 1=tenths, 2=hundredths, 3=milliseconds, 4=tenth-ms, 5=hundredth-ms, 6=microseconds.

diff_tolerance
number | null

Default tolerance for float comparisons. In absolute mode: values within this distance are equal. In relative mode: fraction of difference allowed.

diff_tolerances_per_column
ColumnTolerance · object[] | null

Per-column tolerance overrides. Each entry: {column_name, tolerance_value (>= 0), tolerance_mode: 'absolute'|'relative'}.

download_limit
integer | null
exclude_columns
string[] | null

Columns to exclude from comparison. Ignored if include_columns is set.

file1
string<uri> | null

File URL for Dataset A (s3://, gs://, abfss://, https://). Mutually exclusive with table1 and query1. Requires file1_options.

Minimum string length: 1
file1_options
CSVFileOptions · object

File format options for file1 (file_type, delimiter, sheet, skip rows).

file2
string<uri> | null

File URL for Dataset B (s3://, gs://, abfss://, https://). Mutually exclusive with table2 and query2. Requires file2_options.

Minimum string length: 1
file2_options
CSVFileOptions · object

File format options for file2 (file_type, delimiter, sheet, skip rows).

filter1
string | null

SQL WHERE clause for Dataset A (omit the WHERE keyword), e.g. 'status = 1'.

filter2
string | null

SQL WHERE clause for Dataset B (omit the WHERE keyword), e.g. 'status = 1'.

include_columns
string[] | null

Explicit list of columns to compare. If set, only these columns are diffed.

materialization_destination_id
integer | null

Data source ID where materialized diff results are stored.

materialize_dataset1
boolean | null

Materialize Dataset A before diffing. Improves speed for heavy queries, filtered non-indexed columns, or transformed primary keys.

materialize_dataset2
boolean | null

Materialize Dataset B before diffing. Same use cases as materialize_dataset1.

materialize_without_sampling
boolean | null
default:false

Skip sampling when materializing results.

per_column_diff_limit
integer | null
purged
boolean
default:false
query1
string | null

SQL query for Dataset A. Mutually exclusive with table1 and file1.

query2
string | null

SQL query for Dataset B. Mutually exclusive with table2 and file2.

run_profiles
boolean | null

Run column profiling on diff results.

sampling_confidence
number | null

Sampling confidence level, between 0 and 100 exclusive. Common values: 90, 95, 99, 99.5, 99.9. Use with sampling_tolerance.

sampling_max_rows
integer | null

Maximum number of rows to sample (absolute count). Alternative to tolerance+confidence and sampling_ratio.

sampling_ratio
number | null

Sample this fraction of rows. Value between 0 and 1 exclusive (e.g. 0.1 = 10% of rows). Alternative to tolerance+confidence.

sampling_threshold
integer | null

Minimum row count to activate sampling. Sampling is disabled if the largest table has fewer rows than this.

sampling_tolerance
number | null

Sampling tolerance: max fraction of rows with PK errors before sampling is disabled. Value between 0 and 1 exclusive (e.g. 0.001 = 0.1%). Use with sampling_confidence.

table1
string[] | null

Table path for Dataset A as a list of path components, e.g. ['schema', 'table'] or ['database', 'schema', 'table']. Mutually exclusive with query1 and file1.

table2
string[] | null

Table path for Dataset B as a list of path components, e.g. ['schema', 'table'] or ['database', 'schema', 'table']. Mutually exclusive with query2 and file2.

table_modifiers
enum<string>[] | null

Table-level modifiers. Use ['case_insensitive_strings'] to ignore string case.

Available options:
case_insensitive_strings
tags
string[] | null

Tags for organizing and filtering diffs.

time_aggregate
enum<string> | null

Time aggregation level when using time_column.

Available options:
minute,
hour,
day,
week,
month,
year
time_column
string | null

Column name used for time-based filtering or aggregation.

time_interval_end
string<date-time> | null
time_interval_start
string<date-time> | null
time_travel_point1

Time travel point for Dataset A. Accepts: negative integer offset (e.g. -130), UTC timestamp (e.g. '2024-01-15T00:00:00'), or a time point hash. Only supported by Snowflake and Databricks.

time_travel_point2

Time travel point for Dataset B. Same format as time_travel_point1.

tolerance_mode
enum<string> | null

How diff_tolerance is applied: 'absolute' or 'relative'.

Available options:
absolute,
relative

Response

Successful Response

data_source1_id
integer
required

ID of the first data source (Dataset A).

data_source2_id
integer
required

ID of the second data source (Dataset B). Can be the same as data_source1_id.

kind
enum<string>
required
Available options:
in_db,
cross_db
pk_columns
string[]
required

Column names that uniquely identify rows, e.g. ['id'] or ['tenant_id', 'order_id']. Must match actual column names in both datasets.

affected_columns
string[] | null
algorithm
enum<string> | null

Diff algorithm. 'join' for same-database diffs, 'fetch_and_join' for cross-database or file diffs. Auto-selected if omitted: 'join' when both data sources are the same, 'fetch_and_join' otherwise.

Available options:
join,
hash,
hash_v2_alpha,
fetch_and_join
archived
boolean
default:false
bisection_factor
integer | null
bisection_threshold
integer | null
ci_base_branch
string | null
ci_pr_branch
string | null
ci_pr_num
integer | null
ci_pr_sha
string | null
ci_pr_url
string | null
ci_pr_user_display_name
string | null
ci_pr_user_email
string | null
ci_pr_user_id
string | null
ci_pr_username
string | null
ci_run_id
integer | null
ci_sha_url
string | null
column_mapping
tuple[] | null

Map columns with different names between datasets. List of [column_in_A, column_in_B] pairs.

columns_to_compare
string[] | null

Columns to compare between datasets. If set, only these columns are diffed (primary key columns are always included). Column names must match the dataset schema.

compare_duplicates
boolean | null

Compare rows with duplicate primary keys. Defaults to true.

created_at
string<date-time> | null
data_app_metadata
TDataDiffDataAppMetadata · object
data_app_type
string | null
data_source1_session_parameters
Data Source1 Session Parameters · object

Snowflake session parameters for Dataset A, e.g. {"QUERY_TAG": "datadiff", "WAREHOUSE": "COMPUTE_WH"}.

data_source2_session_parameters
Data Source2 Session Parameters · object

Snowflake session parameters for Dataset B.

datetime_tolerance
integer | null

Datetime precision for comparison. 0=seconds, 1=tenths, 2=hundredths, 3=milliseconds, 4=tenth-ms, 5=hundredth-ms, 6=microseconds.

diff_stats
DiffStats · object
diff_tolerance
number | null

Default tolerance for float comparisons. In absolute mode: values within this distance are equal. In relative mode: fraction of difference allowed.

diff_tolerances_per_column
ColumnTolerance · object[] | null

Per-column tolerance overrides. Each entry: {column_name, tolerance_value (>= 0), tolerance_mode: 'absolute'|'relative'}.

done
boolean | null
default:false
download_limit
integer | null
exclude_columns
string[] | null

Columns to exclude from comparison. Ignored if include_columns is set.

execute_as_user
boolean | null
file1
string<uri> | null

File URL for Dataset A (s3://, gs://, abfss://, https://). Mutually exclusive with table1 and query1. Requires file1_options.

Minimum string length: 1
file1_options
CSVFileOptions · object

File format options for file1 (file_type, delimiter, sheet, skip rows).

file2
string<uri> | null

File URL for Dataset B (s3://, gs://, abfss://, https://). Mutually exclusive with table2 and query2. Requires file2_options.

Minimum string length: 1
file2_options
CSVFileOptions · object

File format options for file2 (file_type, delimiter, sheet, skip rows).

filter1
string | null

SQL WHERE clause for Dataset A (omit the WHERE keyword), e.g. 'status = 1'.

filter2
string | null

SQL WHERE clause for Dataset B (omit the WHERE keyword), e.g. 'status = 1'.

finished_at
string<date-time> | null
id
integer | null
include_columns
string[] | null

Explicit list of columns to compare. If set, only these columns are diffed.

materialization_destination_id
integer | null

Data source ID where materialized diff results are stored.

materialize_dataset1
boolean | null

Materialize Dataset A before diffing. Improves speed for heavy queries, filtered non-indexed columns, or transformed primary keys.

materialize_dataset2
boolean | null

Materialize Dataset B before diffing. Same use cases as materialize_dataset1.

materialize_without_sampling
boolean | null
default:false

Skip sampling when materializing results.

monitor_error
QueryError · object
monitor_id
integer | null
monitor_state
enum<string> | null
Available options:
ok,
alert,
error,
learning,
checking,
created,
skipped,
cancelled
per_column_diff_limit
integer | null
purged
boolean
default:false
query1
string | null

SQL query for Dataset A. Mutually exclusive with table1 and file1.

query2
string | null

SQL query for Dataset B. Mutually exclusive with table2 and file2.

result
enum<string> | null
Available options:
error,
bad-pks,
different,
missing-pks,
identical,
empty
result_revisions
Result Revisions · object
result_statuses
Result Statuses · object
run_profiles
boolean | null

Run column profiling on diff results.

runtime
number | null
sampling_confidence
number | null

Sampling confidence level, between 0 and 100 exclusive. Common values: 90, 95, 99, 99.5, 99.9. Use with sampling_tolerance.

sampling_max_rows
integer | null

Maximum number of rows to sample (absolute count). Alternative to tolerance+confidence and sampling_ratio.

sampling_ratio
number | null

Sample this fraction of rows. Value between 0 and 1 exclusive (e.g. 0.1 = 10% of rows). Alternative to tolerance+confidence.

sampling_threshold
integer | null

Minimum row count to activate sampling. Sampling is disabled if the largest table has fewer rows than this.

sampling_tolerance
number | null

Sampling tolerance: max fraction of rows with PK errors before sampling is disabled. Value between 0 and 1 exclusive (e.g. 0.001 = 0.1%). Use with sampling_confidence.

share_token
string | null
share_url
string | null
source
enum<string> | null
Available options:
interactive,
demo_signup,
manual,
api,
ci,
schedule,
auto
status
enum<string> | null
Available options:
needs_confirmation,
needs_authentication,
waiting,
processing,
done,
failed,
cancelled
table1
string[] | null

Table path for Dataset A as a list of path components, e.g. ['schema', 'table'] or ['database', 'schema', 'table']. Mutually exclusive with query1 and file1.

table2
string[] | null

Table path for Dataset B as a list of path components, e.g. ['schema', 'table'] or ['database', 'schema', 'table']. Mutually exclusive with query2 and file2.

table_modifiers
enum<string>[] | null

Table-level modifiers. Use ['case_insensitive_strings'] to ignore string case.

Available options:
case_insensitive_strings
tags
string[] | null

Tags for organizing and filtering diffs.

temp_schema_override
string[] | null
time_aggregate
enum<string> | null

Time aggregation level when using time_column.

Available options:
minute,
hour,
day,
week,
month,
year
time_column
string | null

Column name used for time-based filtering or aggregation.

time_interval_end
string<date-time> | null
time_interval_start
string<date-time> | null
time_travel_point1

Time travel point for Dataset A. Accepts: negative integer offset (e.g. -130), UTC timestamp (e.g. '2024-01-15T00:00:00'), or a time point hash. Only supported by Snowflake and Databricks.

time_travel_point2

Time travel point for Dataset B. Same format as time_travel_point1.

tolerance_mode
enum<string> | null

How diff_tolerance is applied: 'absolute' or 'relative'.

Available options:
absolute,
relative
updated_at
string<date-time> | null
user_id
integer | null