TRAP formatting: adopt Java's standards

* Encode dates with D"" strings
* Truncate exceedingly long string values
* Note that floats don't require any special handling
This commit is contained in:
Chris Smowton
2022-01-31 15:13:09 +00:00
committed by Ian Lynagh
parent 4adf5829e4
commit b9d8fe72f0
3 changed files with 63 additions and 5 deletions

View File

@@ -124,7 +124,7 @@ def genTable(kt, relname, columns, enum = None, kind = None, num = None, typ = N
elif db_type == 'string':
kt.write('String')
elif db_type == 'date':
kt.write('String')
kt.write('Date')
elif db_type == 'boolean':
kt.write('Boolean')
elif db_type[0] == '@':
@@ -142,11 +142,11 @@ def genTable(kt, relname, columns, enum = None, kind = None, num = None, typ = N
kt.write(comma)
if colname == kind:
kt.write(str(num))
elif db_type == 'string' or db_type == 'date':
kt.write('\\"${escapeTrapString(' + colname + ')}\\"')
elif db_type == 'string':
kt.write('\\"${escapeTrapString(truncateString(' + colname + '))}\\"')
elif db_type == 'date':
kt.write('D\\"${' + colname + '}\\"')
else:
# TODO: Any reformatting or escaping necessary?
# e.g. float formats?
kt.write('$' + colname)
comma = ', '
kt.write(')\\n")\n')
@@ -155,6 +155,7 @@ def genTable(kt, relname, columns, enum = None, kind = None, num = None, typ = N
with open('src/main/kotlin/KotlinExtractorDbScheme.kt', 'w') as kt:
kt.write('/* Generated by ' + sys.argv[0] + ': Do not edit manually. */\n')
kt.write('package com.github.codeql\n')
kt.write('import java.util.Date')
for relname, columns in tables.items():
enum = None

View File

@@ -0,0 +1,52 @@
package com.semmle.util.unicode;
public class UTF8Util {
    /**
     * Computes how many leading UTF-16 code units of a string can be UTF-8
     * encoded without exceeding a byte budget.
     *
     * <p>
     * The result is counted in code units (Java {@code char}s), not code
     * points. A well-formed surrogate pair is costed at four bytes and is
     * either kept whole or dropped whole. Code units that cannot be encoded
     * on their own (a lone high surrogate, or a low surrogate not preceded by
     * a high surrogate) are costed at three bytes, which covers the
     * replacement character an encoder would substitute for them.
     * </p>
     *
     * @param str string to measure
     * @param maxEncodedLength byte budget for the encoded prefix
     * @return number of leading code units of {@code str} that fit in the budget
     */
    public static int encodablePrefixLength(String str, int maxEncodedLength) {
        final int total = str.length();

        // Fast path: a single code unit never costs more than three bytes,
        // so a short enough string is guaranteed to fit in its entirety.
        if (total <= maxEncodedLength / 3) {
            return total;
        }

        int bytesUsed = 0;
        int pos = 0;
        while (pos < total) {
            final char unit = str.charAt(pos);
            final int cost;
            final int advance;

            if (unit <= 0x7f) {
                cost = 1;
                advance = 1;
            } else if (unit <= 0x7ff) {
                cost = 2;
                advance = 1;
            } else if (Character.isHighSurrogate(unit)
                    && pos + 1 < total
                    && Character.isLowSurrogate(str.charAt(pos + 1))) {
                // A complete surrogate pair: four bytes for two code units.
                cost = 4;
                advance = 2;
            } else {
                // Remaining BMP characters and unpaired surrogates.
                cost = 3;
                advance = 1;
            }

            bytesUsed += cost;
            if (bytesUsed > maxEncodedLength) {
                // The character starting at pos does not fit; cut before it.
                return pos;
            }
            pos += advance;
        }
        return total;
    }
}

View File

@@ -8,6 +8,7 @@ import java.io.FileOutputStream
import java.nio.file.Files
import java.nio.file.Paths
import com.semmle.util.files.FileUtil
import com.semmle.util.unicode.UTF8Util
import kotlin.system.exitProcess
class KotlinExtractorExtension(
@@ -117,6 +118,10 @@ class FileExtractionProblems(val invocationExtractionProblems: ExtractionProblem
/**
 * Escapes [str] for use inside a double-quoted TRAP string by doubling
 * every `"` character. All other characters are left untouched.
 */
fun escapeTrapString(str: String): String {
    val quote = "\""
    return str.replace(quote, quote + quote)
}
/** Byte budget handed to [truncateString]: 2^20 bytes (1 MiB). */
const val MAX_STRLEN = 1 shl 20

/**
 * Returns the prefix of [str] whose length is given by
 * `UTF8Util.encodablePrefixLength(str, MAX_STRLEN)`, i.e. the longest prefix
 * that UTF-8 encodes in at most [MAX_STRLEN] bytes.
 */
fun truncateString(str: String): String {
    val keep = UTF8Util.encodablePrefixLength(str, MAX_STRLEN)
    return str.substring(0, keep)
}
private fun equivalentTrap(f1: File, f2: File): Boolean {
f1.bufferedReader().use { bw1 ->
f2.bufferedReader().use { bw2 ->