Merge pull request #205 from max/trap-writer-long-strings

Teach TRAP writer to truncate strings longer than 1MiB.
This commit is contained in:
Sauyon Lee
2019-12-27 11:35:34 -08:00
committed by GitHub Enterprise
2 changed files with 86 additions and 1 deletions

View File

@@ -7,6 +7,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"unicode/utf8"
"github.com/Semmle/go/extractor/srcarchive"
"golang.org/x/tools/go/packages"
@@ -83,6 +84,22 @@ func (tw *Writer) ForEachObject(cb func(*Writer, types.Object, Label)) {
}
}
// max_strlen is the maximum length, in bytes, that capStringLength allows a
// string to have before truncating it (1 MiB).
const max_strlen = 1024 * 1024

// capStringLength returns s unchanged if it is at most max_strlen bytes long.
// Otherwise it returns a prefix of s that is at most max_strlen bytes long and
// that does not end in the middle of a UTF-8 encoded character.
//
// If s is not valid UTF-8 and contains no rune-start byte at or before index
// max_strlen, the empty string is returned.
func capStringLength(s string) string {
	if len(s) <= max_strlen {
		return s
	}

	// Cutting at an arbitrary byte offset could split a multi-byte character,
	// so walk backwards from max_strlen to the nearest byte that starts a
	// UTF-8 sequence and cut right before it. s[max_strlen] is in range
	// because len(s) > max_strlen.
	//
	// The scan is bounded at index 0: a string consisting solely of
	// continuation bytes would otherwise drive the index to -1 and panic.
	end := max_strlen
	for end > 0 && !utf8.RuneStart(s[end]) {
		end--
	}
	return s[:end]
}
// Emit writes out a tuple of values for the given `table`
func (tw *Writer) Emit(table string, values []interface{}) error {
fmt.Fprintf(tw.w, "%s(", table)
@@ -94,7 +111,7 @@ func (tw *Writer) Emit(table string, values []interface{}) error {
case Label:
fmt.Fprint(tw.w, value.id)
case string:
fmt.Fprintf(tw.w, "\"%s\"", escapeString(value))
fmt.Fprintf(tw.w, "\"%s\"", escapeString(capStringLength(value)))
case int:
fmt.Fprintf(tw.w, "%d", value)
default:

View File

@@ -0,0 +1,68 @@
package trap
import (
"strings"
"testing"
)
// Sample characters of different UTF-8 encoded widths, used to build test
// strings whose last character sits before, on, or after the length limit.
const (
// asciiChar is an ASCII character; it occupies a single byte in UTF-8.
asciiChar = "*"
// bmpChar is U+2028, a character in the Basic Multilingual Plane; it
// occupies three bytes in UTF-8.
bmpChar = "\u2028"
// nonBmpChar is U+101D0, a character outside the Basic Multilingual Plane;
// it occupies four bytes in UTF-8.
nonBmpChar = "\U000101d0"
)
// TestCapStringLength exercises capStringLength around the max_strlen
// boundary: pure-ASCII strings just below, at, and above the limit, and
// strings ending in a multi-byte character that lies after the limit,
// straddles it, or just fits before it.
func TestCapStringLength(t *testing.T) {
	// stars builds a string of n ASCII filler characters.
	stars := func(n int) string {
		return strings.Repeat(asciiChar, n)
	}

	// Prefix lengths chosen so that the appended multi-byte character's last
	// byte lands exactly one past the limit (straddling) or exactly on the
	// limit (fitting).
	bmpStraddle := stars(max_strlen - (len(bmpChar) - 1))
	nonBmpStraddle := stars(max_strlen - (len(nonBmpChar) - 1))
	bmpFit := stars(max_strlen-len(bmpChar)) + bmpChar
	nonBmpFit := stars(max_strlen-len(nonBmpChar)) + nonBmpChar

	checks := []struct {
		input   string
		want    string
		failure string
	}{
		// simple cases only involving ASCII characters
		{
			input:   stars(max_strlen - 1),
			want:    stars(max_strlen - 1),
			failure: "Strings shorter than maximum length should not be truncated",
		},
		{
			input:   stars(max_strlen),
			want:    stars(max_strlen),
			failure: "Strings no longer than maximum length should not be truncated",
		},
		{
			input:   stars(max_strlen + 1),
			want:    stars(max_strlen),
			failure: "Strings longer than maximum length should be truncated",
		},
		// chopping off non-ASCII characters
		{
			input:   stars(max_strlen) + bmpChar,
			want:    stars(max_strlen),
			failure: "BMP character after max_strlen should be correctly chopped off",
		},
		{
			input:   stars(max_strlen) + nonBmpChar,
			want:    stars(max_strlen),
			failure: "Non-BMP character after max_strlen should be correctly chopped off",
		},
		{
			input:   bmpStraddle + bmpChar,
			want:    bmpStraddle,
			failure: "BMP character straddling max_strlen should be correctly chopped off",
		},
		{
			input:   nonBmpStraddle + nonBmpChar,
			want:    nonBmpStraddle,
			failure: "Non-BMP character straddling max_strlen should be correctly chopped off",
		},
		// preserving non-ASCII characters that just about fit
		{
			input:   bmpFit,
			want:    bmpFit,
			failure: "BMP character before max_strlen should be correctly preserved",
		},
		{
			input:   nonBmpFit,
			want:    nonBmpFit,
			failure: "Non-BMP character before max_strlen should be correctly preserved",
		},
	}

	for _, c := range checks {
		if capStringLength(c.input) != c.want {
			t.Error(c.failure)
		}
	}
}