mirror of
https://github.com/github/codeql.git
synced 2025-12-17 01:03:14 +01:00
Python: Fix dataset check error for string encoding
Here's an example of one of these errors:
```
INVALID_KEY predicate py_cobjectnames(@py_cobject obj, string name)
The key set {obj} does not functionally determine all fields. Here is a
pair of tuples that agree on the key set but differ at index 1: Tuple 1
in row 63874: (72088,"u'<X>'") Tuple 2 in row 63875: (72088,"u'<?>'")
```
(Here, the substring `X` should really be the Unicode character U+FFFD,
but for some reason I'm not allowed to put that in this commit message.)
Inside the extractor, we assign IDs based on the string type (bytestring
or Unicode) and a hash of the UTF-8 encoded content of the string. In
this case, however, certain _different_ strings were receiving the same
hash, due to replacement characters in the encoding process.
In particular, we were converting unencodable characters to question
marks in one place, and to U+FFFD in another place. This caused a
discrepancy that led to the dataset check error.
To fix this, we put in a custom error handler that always puts the
U+FFFD character in place of unencodable characters. With this, the
strings now agree, and hence there is no clash.
This commit is contained in:
@@ -43,6 +43,23 @@ BUILTINS_NAME = 'builtins'
|
||||
|
||||
# Tuple of the `ast.Num` and `ast.Str` node classes.
# NOTE(review): presumably used for isinstance checks on literal nodes -- confirm at the use sites.
LITERALS = (ast.Num, ast.Str)
|
||||
|
||||
# A variant of the 'replace' error handler that replaces unencodable characters with U+FFFD
|
||||
# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
|
||||
# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source
|
||||
# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
|
||||
def fffd_replace(exc):
    '''Codec error handler that substitutes U+FFFD for unprocessable characters.

    This is a variant of the built-in 'replace' handler. The built-in handler
    emits '?' when encoding, which makes an unencodable string (for example a
    lone surrogate such as U+D800) indistinguishable from the ordinary string
    '?'. This handler always emits the Unicode replacement character U+FFFD.

    `exc` is the UnicodeEncodeError, UnicodeDecodeError or
    UnicodeTranslateError describing the failure.

    Returns a `(replacement, resume_position)` tuple, as required of handlers
    registered with `codecs.register_error`. For encode and translate errors
    the replacement holds one U+FFFD per offending character; for decode
    errors it is a single U+FFFD. Processing resumes at `exc.end`.

    Raises TypeError if `exc` is any other kind of exception.

    Note: when used for encoding, the target codec must itself be able to
    encode U+FFFD (UTF-8 can).
    '''
    # This must be the actual U+FFFD character. An escaped backslash here
    # would produce the six-character text backslash-u-f-f-f-d instead.
    replacement = u"\ufffd"
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        # exc.start:exc.end spans the run of offending characters; replace
        # each one so the length of the run is preserved.
        return ((exc.end - exc.start) * replacement, exc.end)
    if isinstance(exc, UnicodeDecodeError):
        return (replacement, exc.end)
    # Exception instances have no __name__ attribute; take it from the class.
    raise TypeError("can't handle %s" % type(exc).__name__)
|
||||
|
||||
import codecs
|
||||
# Make the handler available under the name "fffdreplace", so that it can be
# selected through the `errors` argument of `str.encode`.
codecs.register_error("fffdreplace", fffd_replace)
|
||||
|
||||
class _CObject(object):
|
||||
'''Utility class to wrap arbitrary C objects.
|
||||
Treat all objects as unique. Rely on naming in the
|
||||
@@ -239,7 +256,7 @@ class ObjectPass(Pass):
|
||||
else:
|
||||
prefix = u"C_bytes$"
|
||||
if t is str:
|
||||
obj = obj.encode("utf8", errors='replace')
|
||||
obj = obj.encode("utf8", errors='fffdreplace')
|
||||
return prefix + hashlib.sha1(obj).hexdigest()
|
||||
if t is bytes:
|
||||
return prefix + hashlib.sha1(obj).hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user