From f0aa815a9aab51a38fa08c0dd5f0143bd91d1d2f Mon Sep 17 00:00:00 2001 From: Michael Hohn Date: Sun, 21 Nov 2021 16:42:11 -0800 Subject: [PATCH] Fix encoding read error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a read using : with open(fname, 'r') as file: hits the accented letter á in Vrána in the file : data/wxWidgets-small/src/stc/scintilla/lexers/LexCSS.cxx it results in a : UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe1 in position 119: invalid continuation byte We are reading source code, so we likely don't care if non-ASCII characters are occasionally mis-decoded; using : with codecs.open(fname, 'r', encoding="latin-1") as file: avoids this problem, because latin-1 maps every byte value to a character and therefore never raises a decode error. --- sarif_cli/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sarif_cli/__init__.py b/sarif_cli/__init__.py index 4ad726d..90927f9 100644 --- a/sarif_cli/__init__.py +++ b/sarif_cli/__init__.py @@ -1,6 +1,7 @@ import sys import os import re +import codecs MIN_PYTHON = (3, 7) if sys.version_info < MIN_PYTHON: @@ -97,7 +98,7 @@ def load_lines(root, path, line_from, line_to): if not os.path.exists(fname): dbg("Missing file: %s" % fname) return [] - with open(fname, 'r') as file: + with codecs.open(fname, 'r', encoding="latin-1") as file: lines = file.readlines() return [line.rstrip("\n\r").replace("\t", " ") for line in lines[line_from-1 : line_to-1+1]]