mirror of
https://github.com/github/codeql.git
synced 2026-04-26 01:05:15 +02:00
Merge pull request #17807 from github/tausbn/python-fix-string-encoding-dataset-check-failure
Python: Fix string encoding dataset check failure
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
"\uD800"
|
||||
"?"
|
||||
18
python/extractor/cli-integration-test/string-encoding/test.sh
Executable file
18
python/extractor/cli-integration-test/string-encoding/test.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash
#
# Integration test for the Python extractor's string-encoding handling:
# builds a CodeQL database from repo_dir/ and then runs a dataset check on it.
# Any failing command aborts the script before the success message is printed.

# Strict error handling plus command tracing;
# see https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -Eeuxo pipefail

# The CodeQL executable can be overridden through the environment.
CODEQL=${CODEQL:-codeql}

# Work from the directory that holds this script so the relative paths below resolve.
SCRIPTDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd "$SCRIPTDIR"

# Drop any database left behind by an earlier run.
rm -rf db

$CODEQL database create db \
    --language python \
    --source-root repo_dir/

$CODEQL dataset check db/db-python

echo "Test successfully completed."
|
||||
@@ -43,6 +43,23 @@ BUILTINS_NAME = 'builtins'
|
||||
|
||||
LITERALS = (ast.Num, ast.Str)
|
||||
|
||||
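# A variant of the 'replace' error handler that replaces unencodable characters with the six-character
# ASCII text '\ufffd' (the handler returns u"\\ufffd", i.e. the escape spelled out, not the U+FFFD code point)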
|
||||
# rather than '?'. Without this, a string like '\uD800' (which is not encodable) would get mapped
|
||||
# to '?', and potentially clash with the regular string '?' if it appeared elsewhere in the source
|
||||
# code. Used in 'get_label_for_object' below. Based on code from https://peps.python.org/pep-0293/
|
||||
def fffd_replace(exc):
    r'''Codec error handler that substitutes the text "\ufffd" for bad input.

    The substituted text is the six ASCII characters backslash, 'u', 'f',
    'f', 'f', 'd' (the escape spelled out), not the U+FFFD code point.

    exc -- the Unicode error instance handed over by the codec machinery.

    Returns a (replacement, resume_position) tuple. Encode and translate
    errors get one replacement per offending character; a decode error gets
    a single replacement for the whole offending range. Processing resumes
    at exc.end in every case.

    Raises TypeError when 'exc' is not one of the three Unicode error types.
    '''
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return ((exc.end - exc.start) * u"\\ufffd", exc.end)
    if isinstance(exc, UnicodeDecodeError):
        return (u"\\ufffd", exc.end)
    # 'exc' is an instance, and instances have no '__name__' attribute, so the
    # class name must be read from its type. Otherwise this line would raise
    # AttributeError instead of the intended TypeError.
    raise TypeError("can't handle %s" % type(exc).__name__)


import codecs

# Expose the handler under the name used by encode(..., errors='fffdreplace').
codecs.register_error("fffdreplace", fffd_replace)
|
||||
|
||||
class _CObject(object):
|
||||
'''Utility class to wrap arbitrary C objects.
|
||||
Treat all objects as unique. Rely on naming in the
|
||||
@@ -239,7 +256,7 @@ class ObjectPass(Pass):
|
||||
else:
|
||||
prefix = u"C_bytes$"
|
||||
if t is str:
|
||||
obj = obj.encode("utf8", errors='replace')
|
||||
obj = obj.encode("utf8", errors='fffdreplace')
|
||||
return prefix + hashlib.sha1(obj).hexdigest()
|
||||
if t is bytes:
|
||||
return prefix + hashlib.sha1(obj).hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user