Merge pull request #9649 from RasmusWL/certificate-modeling

Python/JS/Ruby: Ignore common words (like `certain`) as sensitive data sources
This commit is contained in:
Rasmus Wriedt Larsen
2022-06-23 12:04:58 +02:00
committed by GitHub
7 changed files with 39 additions and 6 deletions

View File

@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* Improved modeling of sensitive data sources, so that common words like `certain` and `secretary` are no longer treated as indicating a certificate and a secret, respectively.

View File

@@ -50,7 +50,7 @@ module HeuristicNames {
* Gets a regular expression that identifies strings that may indicate the presence of secret
* or trusted data.
*/
string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
string maybeSecret() { result = "(?is).*((?<!is|is_)secret|(?<!un|un_|is|is_)trusted).*" }
/**
* Gets a regular expression that identifies strings that may indicate the presence of
@@ -96,10 +96,14 @@ module HeuristicNames {
* Gets a regular expression that identifies strings that may indicate the presence of data
* that is hashed or encrypted, and hence rendered non-sensitive, or contains special characters
* suggesting nouns within the string do not represent the meaning of the whole string (e.g. a URL or a SQL query).
*
* We also filter out common words like `certain` and `concert`, since these would otherwise
* be matched by the certificate regular expressions. The same applies to `accountable`
* (which contains `account`) and `secretarial` (which contains `secret`).
*/
string notSensitiveRegexp() {
result =
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)).*"
"(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?<!un)(en))?(crypt|code)|certain|concert|secretar|accountant|accountab).*"
}
/**