Things on this page are fragmentary and immature notes/thoughts of the author. Please read with your own judgement!
In [2]:
!pip3 install sqlfluff
Parse the SQL file Using sqlfluff parse
¶
You can also use the Python API sqlfluff.parse
to parse a SQL query/file,
however,
you have to manually handle exceptions at this time.
In [1]:
!sqlfluff parse example.sql > example_parse.log
Parse the Log File to Extract Identifier Names¶
In [ ]:
import re
from pathlib import Path
from typing import Set
In [ ]:
def extract_identifier(line):
identifier = line.strip().rsplit(maxsplit=1)[-1]
return identifier[1:-1]
def extract_reference(lines, idx):
if "dot:" in lines[idx + 2]:
return extract_identifier(lines[idx + 1]), extract_identifier(lines[idx + 3])
return "", extract_identifier(lines[idx + 1])
def gen_mapping(names: Set[str], prefix):
mapping = {}
idx = 1
for name in names:
if name in mapping:
continue
name_lower = name.lower()
if name_lower in mapping:
mapping[name] = mapping[name_lower]
continue
mapping[name] = prefix + str(idx)
mapping[name_lower] = mapping[name]
idx += 1
return mapping
def anonymize(sql, *mappings):
for mapping in mappings:
for key, val in mapping.items():
sql = sql.replace(key, val)
return sql
In [ ]:
with Path("parse_example.log").open() as fin:
lines = fin.readlines()
col_refs = [
extract_reference(lines, idx)
for idx, line in enumerate(lines)
if re.search("\|\s*column_reference:", line)
]
cols = set(col for _, col in col_refs)
cols
In [ ]:
tab_refs = [
extract_table_reference(lines, idx)
for idx, line in enumerate(lines)
if re.search("\|\s*table_reference:", line)
]
dbs = set(db for db, _ in tab_refs if db)
dbs
In [ ]:
tables = set(table for _, table in tab_refs if table)
tables
Anonymize a SQL File¶
Once you get all database, table and column names from a SQL query, you can anonymize the SQL query by replacing database, table and column names with non-sensitive names.
In [ ]:
sql = Path("example.sql").read_text()
sql_anon = anonymize(
sql,
gen_mapping(dbs, "db_"),
gen_mapping(tables, "table_"),
gen_mapping(cols, "col_"),
)