Merge pull request #205 from max/trap-writer-long-strings

Teach TRAP writer to truncate strings longer than 1MiB.
2026-05-02 20:25:13 +02:00 · 2019-12-27 11:35:34 -08:00
parent 1df3585c92 121c940ace
commit 9fd7db7e43
2 changed files with 86 additions and 1 deletions
--- a/extractor/trap/trapwriter.go
+++ b/extractor/trap/trapwriter.go
@@ -7,6 +7,7 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
+	"unicode/utf8"

 	"github.com/Semmle/go/extractor/srcarchive"
 	"golang.org/x/tools/go/packages"
@@ -83,6 +84,22 @@ func (tw *Writer) ForEachObject(cb func(*Writer, types.Object, Label)) {
 	}
 }

+// max_strlen is the length in bytes (1MiB) above which capStringLength truncates a string
+const max_strlen = 1024 * 1024
+
+// capStringLength cuts `s` down to at most max_strlen bytes without splitting a UTF-8 character
+func capStringLength(s string) string {
+	if len(s) <= max_strlen {
+		return s
+	}
+	// cut right before the last byte at or before index max_strlen that starts a character;
+	// `end > 0` stops the scan on malformed input without such a byte instead of panicking
+	end := max_strlen
+	for ; end > 0 && !utf8.RuneStart(s[end]); end-- {
+	}
+	return s[:end]
+}
+
 // Emit writes out a tuple of values for the given `table`
 func (tw *Writer) Emit(table string, values []interface{}) error {
 	fmt.Fprintf(tw.w, "%s(", table)
@@ -94,7 +111,7 @@ func (tw *Writer) Emit(table string, values []interface{}) error {
 		case Label:
 			fmt.Fprint(tw.w, value.id)
 		case string:
-			fmt.Fprintf(tw.w, "\"%s\"", escapeString(value))
+			fmt.Fprintf(tw.w, "\"%s\"", escapeString(capStringLength(value)))
 		case int:
 			fmt.Fprintf(tw.w, "%d", value)
 		default:
--- a/extractor/trap/trapwriter_test.go
+++ b/extractor/trap/trapwriter_test.go
@@ -0,0 +1,68 @@
+package trap
+
+import (
+	"strings"
+	"testing"
+)
+
+const (
+	asciiChar  = "*"          // one byte in UTF-8
+	bmpChar    = "\u2028"     // three bytes in UTF-8
+	nonBmpChar = "\U000101d0" // four bytes in UTF-8
+)
+
+func TestCapStringLength(t *testing.T) {
+	// check runs capStringLength on `stars` ASCII characters followed by `tail`, and reports
+	// `failure` unless the result is the whole input (when `keepTail` is set) or only the
+	// leading ASCII characters (when it is not)
+	check := func(stars int, tail string, keepTail bool, failure string) {
+		t.Helper()
+		want := strings.Repeat(asciiChar, stars)
+		input := want + tail
+		if keepTail {
+			want = input
+		}
+		if capStringLength(input) != want {
+			t.Error(failure)
+		}
+	}
+
+	// plain ASCII input just below, exactly at, and just above the limit
+	check(max_strlen-1, "", true,
+		"Strings shorter than maximum length should not be truncated",
+	)
+
+	check(max_strlen, "", true,
+		"Strings no longer than maximum length should not be truncated",
+	)
+
+	check(max_strlen, asciiChar, false,
+		"Strings longer than maximum length should be truncated",
+	)
+
+	// a multi-byte character that starts at the limit or straddles it has to go entirely
+	check(max_strlen, bmpChar, false,
+		"BMP character after max_strlen should be correctly chopped off",
+	)
+
+	check(max_strlen, nonBmpChar, false,
+		"Non-BMP character after max_strlen should be correctly chopped off",
+	)
+
+	check(max_strlen-(len(bmpChar)-1), bmpChar, false,
+		"BMP character straddling max_strlen should be correctly chopped off",
+	)
+
+	check(max_strlen-(len(nonBmpChar)-1), nonBmpChar, false,
+		"Non-BMP character straddling max_strlen should be correctly chopped off",
+	)
+
+	// a multi-byte character that ends exactly at the limit still fits and has to stay
+	check(max_strlen-len(bmpChar), bmpChar, true,
+		"BMP character before max_strlen should be correctly preserved",
+	)
+
+	check(max_strlen-len(nonBmpChar), nonBmpChar, true,
+		"Non-BMP character before max_strlen should be correctly preserved",
+	)
+}