Skip to content

[DLP] Implemented deidentifying and reidentifying of table using fpe #10234

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 289 additions & 0 deletions dlp/snippets/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -2040,6 +2040,188 @@ def deidentify_table_with_multiple_crypto_hash(
# [END dlp_deidentify_table_with_multiple_crypto_hash]


# [START dlp_deidentify_table_fpe]
from typing import List # noqa: F811, E402, I100

import google.cloud.dlp # noqa: F811, E402


def deidentify_table_with_fpe(
project: str,
table_header: List[str],
table_rows: List[List[str]],
deid_field_names: List[str],
key_name: str = None,
wrapped_key: bytes = None,
alphabet: str = None,
) -> None:
"""Uses the Data Loss Prevention API to de-identify sensitive data in a
table while maintaining format.

Args:
project: The Google Cloud project id to use as a parent resource.
table_header: List of strings representing table field names.
table_rows: List of rows representing table data.
deid_field_names: A list of fields in table to de-identify.
key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
AES-256 key. Example:
key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
wrapped_key: The decrypted ('wrapped', in bytes) AES-256 key to use. This key
should be encrypted using the Cloud KMS key specified by key_name.
alphabet: The set of characters to replace sensitive ones with. For
more information, see https://cloud.google.com/dlp/docs/reference/
rest/v2/projects.deidentifyTemplates#ffxcommonnativealphabet
"""

# Instantiate a client.
dlp = google.cloud.dlp_v2.DlpServiceClient()

# Construct the `table`. For more details on the table schema, please see
# https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
headers = [{"name": val} for val in table_header]
rows = []
for row in table_rows:
rows.append({"values": [{"string_value": cell_val} for cell_val in row]})

table = {"headers": headers, "rows": rows}

# Construct the `item` for table.
item = {"table": table}

# Specify fields to be de-identified.
deid_field_names = [{"name": _i} for _i in deid_field_names]

# Construct FPE configuration dictionary
crypto_replace_ffx_fpe_config = {
"crypto_key": {
"kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name},
},
"common_alphabet": alphabet,
}

# Construct deidentify configuration dictionary
deidentify_config = {
"record_transformations": {
"field_transformations": [
{
"primitive_transformation": {
"crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config
},
"fields": deid_field_names,
}
]
}
}

# Convert the project id into a full resource id.
parent = f"projects/{project}"

# Call the API.
response = dlp.deidentify_content(
request={
"parent": parent,
"deidentify_config": deidentify_config,
"item": item
})

# Print out results.
print(f"Table after de-identification: {response.item.table}")


# [END dlp_deidentify_table_fpe]


# [START dlp_reidentify_table_fpe]
from typing import List # noqa: F811, E402, I100

import google.cloud.dlp # noqa: F811, E402


def reidentify_table_with_fpe(
project: str,
table_header: List[str],
table_rows: List[List[str]],
reid_field_names: List[str],
key_name: str = None,
wrapped_key: bytes = None,
alphabet: str = None,
) -> None:
"""Uses the Data Loss Prevention API to re-identify sensitive data in a
table that was encrypted by Format Preserving Encryption (FPE).

Args:
project: The Google Cloud project id to use as a parent resource.
table_header: List of strings representing table field names.
table_rows: List of rows representing table data.
reid_field_names: A list of fields in table to re-identify.
key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
AES-256 key. Example:
key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
wrapped_key: The decrypted ('wrapped', in bytes) AES-256 key to use. This key
should be encrypted using the Cloud KMS key specified by key_name.
alphabet: The set of characters to replace sensitive ones with. For
more information, see https://cloud.google.com/dlp/docs/reference/
rest/v2/projects.deidentifyTemplates#ffxcommonnativealphabet
"""

# Instantiate a client.
dlp = google.cloud.dlp_v2.DlpServiceClient()

# Construct the `table`. For more details on the table schema, please see
# https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
headers = [{"name": val} for val in table_header]
rows = []
for row in table_rows:
rows.append({"values": [{"string_value": cell_val} for cell_val in row]})
table = {"headers": headers, "rows": rows}

# Convert table to `item`
item = {"table": table}

# Specify fields to be re-identified/decrypted.
reid_field_names = [{"name": _i} for _i in reid_field_names]

# Construct FPE configuration dictionary
crypto_replace_ffx_fpe_config = {
"crypto_key": {
"kms_wrapped": {"wrapped_key": wrapped_key, "crypto_key_name": key_name}
},
"common_alphabet": alphabet,
}

# Construct reidentify configuration dictionary
reidentify_config = {
"record_transformations": {
"field_transformations": [
{
"primitive_transformation": {
"crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config,
},
"fields": reid_field_names,
}
]
}
}

# Convert the project id into a full resource id.
parent = f"projects/{project}"

# Call the API.
response = dlp.reidentify_content(
request={
"parent": parent,
"reidentify_config": reidentify_config,
"item": item,
})

# Print out results.
print("Table after re-identification: {}".format(response.item.table))


# [END dlp_reidentify_table_fpe]

if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(
Expand Down Expand Up @@ -2588,6 +2770,93 @@ def deidentify_table_with_multiple_crypto_hash(
"deid_fields_2",
help="List of column names in table to de-identify using transient_key_name_2.",
)

table_fpe_parser = subparsers.add_parser(
"deid_table_fpe",
help="Deidentify sensitive data in a string using Format Preserving "
"Encryption (FPE).",
)
table_fpe_parser.add_argument(
"project",
help="The Google Cloud project id to use as a parent resource.",
)
table_fpe_parser.add_argument(
"table_header",
help="List of strings representing table field names.",
)
table_fpe_parser.add_argument(
"table_rows",
help="List of rows representing table data",
)
table_fpe_parser.add_argument(
"deid_field_names",
help="A list of fields in table to de-identify.",
)
table_fpe_parser.add_argument(
"key_name",
help="The name of the Cloud KMS key used to encrypt ('wrap') the "
"AES-256 key. Example: "
"key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
"keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
)
table_fpe_parser.add_argument(
"wrapped_key",
help="The encrypted ('wrapped') AES-256 key to use. This key should "
"be encrypted using the Cloud KMS key specified by key_name.",
)
table_fpe_parser.add_argument(
"-a",
"--alphabet",
default="ALPHA_NUMERIC",
help="The set of characters to replace sensitive ones with. Commonly "
'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
'"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
'"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
)

reid_table_fpe_parser = subparsers.add_parser(
"reid_table_fpe",
help="Re-identify sensitive data in a table using Format Preserving "
"Encryption (FPE).",
)
reid_table_fpe_parser.add_argument(
"project",
help="The Google Cloud project id to use as a parent resource.",
)
reid_table_fpe_parser.add_argument(
"table_header",
help="List of strings representing table field names.",
)
reid_table_fpe_parser.add_argument(
"table_rows",
help="List of rows representing table data",
)
reid_table_fpe_parser.add_argument(
"reid_field_names",
help="A list of fields in table to re-identify.",
)
reid_table_fpe_parser.add_argument(
"key_name",
help="The name of the Cloud KMS key used to encrypt ('wrap') the "
"AES-256 key. Example: "
"key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
"keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
)
reid_table_fpe_parser.add_argument(
"wrapped_key",
help="The encrypted ('wrapped') AES-256 key to use. This key should "
"be encrypted using the Cloud KMS key specified by key_name.",
)
reid_table_fpe_parser.add_argument(
"-a",
"--alphabet",
default="ALPHA_NUMERIC",
help="The set of characters to replace sensitive ones with. Commonly "
'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
'"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
'"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
)

args = parser.parse_args()

if args.content == "deid_mask":
Expand Down Expand Up @@ -2731,3 +3000,23 @@ def deidentify_table_with_multiple_crypto_hash(
args.deid_fields_1,
args.deid_fields_2,
)
elif args.content == "deid_table_fpe":
deidentify_table_with_fpe(
args.project,
args.table_header,
args.table_rows,
args.deid_field_names,
wrapped_key=base64.b64decode(args.wrapped_key),
key_name=args.key_name,
alphabet=args.alphabet,
)
elif args.content == "reid_table_fpe":
reidentify_table_with_fpe(
args.project,
args.table_header,
args.table_rows,
args.reid_field_names,
wrapped_key=base64.b64decode(args.wrapped_key),
key_name=args.key_name,
alphabet=args.alphabet,
)
53 changes: 52 additions & 1 deletion dlp/snippets/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import os
import shutil
import tempfile
Expand Down Expand Up @@ -595,3 +595,54 @@ def test_deidentify_table_with_multiple_crypto_hash(
assert 'string_value: "abbyabernathy1"' not in out
assert "my userid is abbyabernathy1" in out
assert "[email protected]" not in out


def test_deidentify_and_reidentify_table_with_fpe(capsys: pytest.CaptureFixture) -> None:
table_data = {
"header": ["employee_id", "date", "compensation"],
"rows": [
["11111", "2015", "$10"],
["22222", "2016", "$20"],
["33333", "2016", "$15"],
]
}

deid.deidentify_table_with_fpe(
GCLOUD_PROJECT,
table_data["header"],
table_data["rows"],
["employee_id"],
alphabet='NUMERIC',
wrapped_key=base64.b64decode(WRAPPED_KEY),
key_name=KEY_NAME,
)

out, _ = capsys.readouterr()
assert "11111" not in out
assert "22222" not in out

response = out.split(":")[1:]

deid_col_id = response.index(' "employee_id"\n}\nheaders {\n name')
total_columns = len(table_data['header'])
total_rows = len(table_data['rows'][0])

deid_emp_ids = [response[i].split("\n")[0][2:-1] for i in
range(deid_col_id + total_columns, len(response), total_columns)]

for i in range(total_rows):
table_data['rows'][i][deid_col_id - 1] = deid_emp_ids[i]

deid.reidentify_table_with_fpe(
GCLOUD_PROJECT,
table_data["header"],
table_data["rows"],
["employee_id"],
alphabet='NUMERIC',
wrapped_key=base64.b64decode(WRAPPED_KEY),
key_name=KEY_NAME,
)

out, _ = capsys.readouterr()
assert "11111" in out
assert "22222" in out