Extract comments (based on C# comments extraction with element stack)

This commit is contained in:
Tamas Vajk
2021-09-07 14:44:06 +02:00
committed by Ian Lynagh
parent fd8dd21f75
commit 1c8be155c9
7 changed files with 289 additions and 17 deletions

View File

@@ -1,29 +1,26 @@
package com.github.codeql
import com.github.codeql.comments.CommentExtractor
import org.jetbrains.kotlin.backend.common.extensions.IrGenerationExtension
import org.jetbrains.kotlin.backend.common.extensions.IrPluginContext
import org.jetbrains.kotlin.descriptors.ClassKind
import org.jetbrains.kotlin.ir.IrElement
import org.jetbrains.kotlin.ir.IrStatement
import org.jetbrains.kotlin.ir.declarations.*
import org.jetbrains.kotlin.ir.expressions.*
import org.jetbrains.kotlin.ir.expressions.IrStatementOrigin.*
import org.jetbrains.kotlin.ir.symbols.IrClassifierSymbol
import org.jetbrains.kotlin.ir.types.*
import org.jetbrains.kotlin.ir.util.packageFqName
import org.jetbrains.kotlin.ir.util.render
import java.io.File
import java.io.FileOutputStream
import java.io.PrintWriter
import java.io.StringWriter
import java.nio.file.Files
import java.nio.file.Path
import java.nio.file.Paths
import java.util.Optional
import java.util.*
import kotlin.system.exitProcess
import org.jetbrains.kotlin.backend.common.extensions.IrGenerationExtension
import org.jetbrains.kotlin.backend.common.extensions.IrPluginContext
import org.jetbrains.kotlin.ir.declarations.*
import org.jetbrains.kotlin.ir.util.dump
import org.jetbrains.kotlin.ir.util.IdSignature
import org.jetbrains.kotlin.ir.util.packageFqName
import org.jetbrains.kotlin.ir.util.render
import org.jetbrains.kotlin.ir.visitors.IrElementVisitor
import org.jetbrains.kotlin.ir.IrFileEntry
import org.jetbrains.kotlin.ir.types.*
import org.jetbrains.kotlin.ir.expressions.*
import org.jetbrains.kotlin.ir.expressions.IrStatementOrigin.*
import org.jetbrains.kotlin.ir.IrStatement
import org.jetbrains.kotlin.ir.symbols.IrClassifierSymbol
import org.jetbrains.kotlin.descriptors.ClassKind
class KotlinExtractorExtension(private val invocationTrapFile: String, private val checkTrapIdentical: Boolean) : IrGenerationExtension {
override fun generate(moduleFragment: IrModuleFragment, pluginContext: IrPluginContext) {
@@ -155,6 +152,9 @@ fun <T> fakeLabel(): Label<T> {
}
class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val file: IrFile) {
private val commentExtractor: CommentExtractor = CommentExtractor(logger, tw, file)
val fileClass by lazy {
extractFileClass(file)
}
@@ -164,8 +164,11 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
val pkgId = extractPackage(pkg)
tw.writeCupackage(id, pkgId)
file.declarations.map { extractDeclaration(it, Optional.empty()) }
commentExtractor.extract()
commentExtractor.bindCommentsToElement()
}
fun extractFileClass(f: IrFile): Label<out DbClass> {
val fileName = f.fileEntry.name
val pkg = f.fqName.asString()
@@ -291,6 +294,7 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
}
fun extractClass(c: IrClass): Label<out DbClassorinterface> {
commentExtractor.addPossibleCommentOwner(c)
val id = addClassLabel(c)
val locId = tw.getLocation(c)
val pkg = c.packageFqName?.asString() ?: ""
@@ -388,6 +392,7 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
}
fun extractFunction(f: IrFunction, parentid: Label<out DbReftype>) {
commentExtractor.addPossibleCommentOwner(f)
val id = useFunction(f)
val locId = tw.getLocation(f)
val signature = "TODO"
@@ -411,6 +416,7 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
}
fun extractProperty(p: IrProperty, parentid: Label<out DbReftype>) {
commentExtractor.addPossibleCommentOwner(p)
val bf = p.backingField
if(bf == null) {
logger.warnElement(Severity.ErrorSevere, "IrProperty without backing field", p)
@@ -424,6 +430,7 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
}
fun extractBody(b: IrBody, callable: Label<out DbCallable>) {
commentExtractor.addPossibleCommentOwner(b)
when(b) {
is IrBlockBody -> extractBlockBody(b, callable, callable, 0)
else -> logger.warnElement(Severity.ErrorSevere, "Unrecognised IrBody: " + b.javaClass, b)
@@ -460,6 +467,7 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
}
fun extractStatement(s: IrStatement, callable: Label<out DbCallable>, parent: Label<out DbStmtparent>, idx: Int) {
commentExtractor.addPossibleCommentOwner(s)
when(s) {
is IrExpression -> {
extractExpression(s, callable, parent, idx)
@@ -585,6 +593,7 @@ class KotlinFileExtractor(val logger: FileLogger, val tw: FileTrapWriter, val fi
}
fun extractExpression(e: IrExpression, callable: Label<out DbCallable>, parent: Label<out DbExprparent>, idx: Int) {
commentExtractor.addPossibleCommentOwner(e)
when(e) {
is IrCall -> extractCall(e, callable, parent, idx)
is IrConst<*> -> {

View File

@@ -0,0 +1,14 @@
package com.github.codeql
import org.jetbrains.kotlin.ir.IrElement
/**
 * A source span described by a start and an end character offset.
 */
data class Location(val startOffset: Int, val endOffset: Int) {
    /**
     * Returns `true` when [location] lies entirely inside this span.
     * Both boundaries are compared inclusively, so a span contains itself.
     */
    fun contains(location: Location): Boolean =
        location.startOffset >= startOffset && location.endOffset <= endOffset
}
/** Builds a [Location] from this element's `startOffset` and `endOffset`. */
fun IrElement.getLocation(): Location = Location(startOffset, endOffset)

View File

@@ -0,0 +1,13 @@
package com.github.codeql.comments
import com.github.codeql.Location
/**
 * A comment found in a source file.
 *
 * @property rawText the comment text as it was stored by the creator of this object
 * @property startOffset offset at which the comment starts
 * @property endOffset offset at which the comment ends
 * @property type the kind of comment
 */
data class Comment(
    val rawText: String,
    val startOffset: Int,
    val endOffset: Int,
    val type: CommentType
) {
    /** The span covered by this comment, as a [Location]. */
    fun getLocation(): Location = Location(startOffset, endOffset)

    /** Short description showing the raw text and the offset range; the comment type is not included. */
    override fun toString(): String = "Comment: $rawText [$startOffset-$endOffset]"
}

View File

@@ -0,0 +1,8 @@
package com.github.codeql.comments
/**
 * The ways a comment can be associated with a program element (taken from the C# extractor).
 */
enum class CommentBinding {
    /** The parent element of a comment. */
    Parent,

    /** The most likely element associated with a comment. */
    Best,

    /** The element before the comment. */
    Before,

    /** The element after the comment. */
    After
}

View File

@@ -0,0 +1,176 @@
package com.github.codeql.comments
import com.github.codeql.FileLogger
import com.github.codeql.Logger
import com.github.codeql.Severity
import com.github.codeql.TrapWriter
import com.intellij.psi.PsiComment
import com.intellij.psi.PsiElement
import org.jetbrains.kotlin.backend.common.psi.PsiSourceManager
import org.jetbrains.kotlin.backend.jvm.ir.getKtFile
import org.jetbrains.kotlin.ir.IrElement
import org.jetbrains.kotlin.ir.declarations.IrFile
import org.jetbrains.kotlin.ir.declarations.path
import org.jetbrains.kotlin.ir.util.dump
import org.jetbrains.kotlin.kdoc.psi.api.KDoc
import org.jetbrains.kotlin.lexer.KtTokens
import org.jetbrains.kotlin.psi.KtDeclaration
import org.jetbrains.kotlin.psi.KtVisitor
import org.jetbrains.kotlin.psi.findDocComment.findDocComment
import org.jetbrains.kotlin.psi.psiUtil.endOffset
import org.jetbrains.kotlin.psi.psiUtil.startOffset
import org.jetbrains.kotlin.utils.addToStdlib.cast
/**
 * Collects the comments of one Kotlin source file together with the IR elements that may own them,
 * and matches each comment against its surrounding elements.
 *
 * Usage: register candidate owners with [addPossibleCommentOwner], gather the comments with [extract],
 * then call [bindCommentsToElement]. Binding is work in progress: the matches are only printed, nothing
 * is written through [tw] yet.
 *
 * @param logger receives warnings and informational messages
 * @param tw trap writer intended for storing the results (currently unused)
 * @param file the IR file whose comments are processed
 */
class CommentExtractor(private val logger: FileLogger, private val tw: TrapWriter, private val file: IrFile) {
    // PSI view of the file; when it is null, no element or comment is ever collected.
    private val ktFile = file.getKtFile()
    private val comments = mutableListOf<Comment>()
    private val elements = mutableListOf<IrElement>()

    init {
        if (ktFile == null) {
            logger.warn(Severity.Warn, "Comments are not being processed in ${file.path}.")
        }
    }

    /**
     * Registers [elem] as a candidate owner of comments.
     *
     * Elements are ignored when the file has no PSI representation or when either of their offsets is
     * negative, because such offsets do not describe a real position in the source text.
     */
    fun addPossibleCommentOwner(elem: IrElement) {
        if (ktFile == null) {
            return
        }

        // The IR marks elements without a source position with negative sentinel offsets
        // (UNDEFINED_OFFSET = -1, SYNTHETIC_OFFSET = -2), so reject every negative value, not just -1.
        if (elem.startOffset < 0 || elem.endOffset < 0) {
            logger.info("Skipping element with negative offsets: ${elem.dump()}")
            return
        }

        // Debug output only: show the PSI element (and its doc comment, if any) behind the IR element.
        val psiElement = PsiSourceManager.findPsiElement(elem, file)
        if (psiElement != null) {
            println("PSI: $psiElement for ${elem.dump()}")
            if (psiElement is KtDeclaration) {
                val docComment = findDocComment(psiElement)
                if (docComment != null) {
                    println("doc comment: ${docComment.text}")
                }
            }
        }

        elements.add(elem)
    }

    /**
     * Match comments to program elements.
     *
     * Comments and elements are both ordered by start offset and walked in parallel: every comment that
     * ends before the next element starts is bound with that element as its "next" element, and the
     * element is then pushed onto the element stack. Comments left over after the last element are bound
     * with no next element. Does nothing when no comments were collected.
     */
    fun bindCommentsToElement() {
        if (comments.isEmpty()) {
            return
        }

        comments.sortBy { it.startOffset }
        elements.sortBy { it.startOffset }

        var commentIndex = 0
        val elementStack = ElementStack()

        for (nextElement in elements) {
            val commentsForElement = mutableListOf<Comment>()
            while (commentIndex < comments.size &&
                comments[commentIndex].endOffset < nextElement.startOffset
            ) {
                commentsForElement.add(comments[commentIndex])
                commentIndex++
            }

            bindCommentsToElements(commentsForElement, elementStack, nextElement)
            elementStack.push(nextElement)
        }

        // Comments after last element
        bindCommentsToElements(comments.drop(commentIndex), elementStack, null)
    }

    /**
     * Bind selected comments to elements. Elements are selected from the element stack or from the next element.
     *
     * @param commentsForElement the comments to bind; may be empty
     * @param elementStack the elements pushed so far
     * @param nextElement the element following the comments, or `null` when there is none
     */
    private fun bindCommentsToElements(
        commentsForElement: Collection<Comment>,
        elementStack: ElementStack,
        nextElement: IrElement?
    ) {
        for (comment in commentsForElement) {
            println("Comment: $comment")
            val parent = elementStack.findParent(comment.getLocation())
            println("parent: ${parent?.dump()}")
            val before = elementStack.findBefore(comment.getLocation())
            println("before: ${before?.dump()}")
            val after = elementStack.findAfter(comment.getLocation(), nextElement)
            println("after: ${after?.dump()}")

            // todo: best match
        }

        // todo write matches to DB: tw.writeHasJavadoc()
    }

    /**
     * Walks the PSI tree of the file and records every end-of-line, block and doc comment.
     * Comments with any other token type are reported with a warning and skipped.
     * Does nothing when the file has no PSI representation.
     */
    fun extract() {
        ktFile?.accept(
            object : KtVisitor<Unit, Unit>() {
                override fun visitElement(element: PsiElement) {
                    element.acceptChildren(this)

                    // Slightly hacky, but `visitComment` doesn't seem to visit comments with `tokenType` `KtTokens.DOC_COMMENT`
                    if (element is PsiComment) {
                        visitCommentElement(element)
                    }
                }

                private fun visitCommentElement(comment: PsiComment) {
                    // val loc = tw.getLocation(comment.startOffset, comment.endOffset)
                    // val id: Label<DbJavadoc> = tw.getLabelFor(";comment")
                    // tw.writeJavadoc(id)

                    val type: CommentType = when (comment.tokenType) {
                        KtTokens.EOL_COMMENT -> CommentType.SingleLine
                        KtTokens.BLOCK_COMMENT -> CommentType.Block
                        KtTokens.DOC_COMMENT -> CommentType.Doc
                        else -> {
                            logger.warn(Severity.Warn, "Unhandled comment token type: ${comment.tokenType}")
                            return
                        }
                    }

                    if (comment.tokenType == KtTokens.DOC_COMMENT) {
                        // Safe cast: this is debug output only and must not abort extraction
                        // if the PSI node is unexpectedly not a KDoc.
                        val kdoc = comment as? KDoc
                        if (kdoc != null) {
                            for (sec in kdoc.getAllSections()) {
                                println("section content: ${sec.getContent()}")
                            }
                        }
                    }

                    comments.add(Comment(comment.text, comment.startOffset, comment.endOffset, type))
                    // todo:
                    // - store each comment in the DB
                    // - do further processing on Doc comments (extract @tag text, @tag name text, @tag[name] text)
                }
            })
    }
}

View File

@@ -0,0 +1,5 @@
package com.github.codeql.comments
/**
 * The syntactic kind of a comment.
 */
enum class CommentType {
    /** A single-line comment. */
    SingleLine,

    /** A block comment. */
    Block,

    /** A documentation comment. */
    Doc
}

View File

@@ -0,0 +1,47 @@
package com.github.codeql.comments
import com.github.codeql.Location
import com.github.codeql.getLocation
import org.jetbrains.kotlin.ir.IrElement
import java.util.ArrayDeque
/**
 * Stack of elements, where each element in the stack fully contains the elements above it.
 *
 * The most recently pushed element is the top of the stack and is visited first when iterating.
 */
class ElementStack {
    private val stack = ArrayDeque<IrElement>()

    /**
     * Pushes [element] after first discarding, from the top down, every element that does not fully
     * contain it. Discarding stops at the first element that does contain it.
     */
    fun push(element: IrElement) {
        val incoming = element.getLocation()
        while (true) {
            val top = stack.peek() ?: break
            if (top.getLocation().contains(incoming)) {
                break
            }
            stack.pop()
        }
        stack.push(element)
    }

    /**
     * Returns the element closest to the bottom of the stack that ends strictly before [location]
     * starts, or `null` when no stacked element does.
     */
    fun findBefore(location: Location): IrElement? {
        return stack.lastOrNull { candidate ->
            candidate.getLocation().endOffset < location.startOffset
        }
    }

    /**
     * Returns [next] when it may follow [location] inside the same parent: either no stacked element
     * contains [location], or the element found by [findParent] also contains [next].
     * Returns `null` otherwise, and also when [next] is `null`.
     */
    fun findAfter(location: Location, next: IrElement?): IrElement? {
        if (next == null) {
            return null
        }
        val enclosing = findParent(location)
        return if (enclosing == null || enclosing.getLocation().contains(next.getLocation())) {
            next
        } else {
            null
        }
    }

    /**
     * Returns the element closest to the top of the stack that fully contains [location],
     * or `null` when none does.
     */
    fun findParent(location: Location): IrElement? {
        return stack.find { candidate -> candidate.getLocation().contains(location) }
    }
}