Files
codeql/swift/extractor/SwiftExtractor.cpp
Paolo Tranquilli e84bdf5bed Swift: use hashing for lazy decl trap file names
It turns out mangled names can sometimes be too long. While this code
will eventually be replaced by our own mangling, we need to use hashing
to cut down the names.

Module and decl names are preserved in the trap file names for
debuggability.
2023-04-24 14:36:36 +02:00

268 lines
9.8 KiB
C++

#include "SwiftExtractor.h"
#include <iostream>
#include <memory>
#include <unordered_set>
#include <queue>
#include <swift/AST/SourceFile.h>
#include <swift/AST/Builtins.h>
#include "swift/extractor/translators/SwiftVisitor.h"
#include "swift/extractor/infra/TargetDomains.h"
#include "swift/extractor/infra/file/Path.h"
#include "swift/extractor/infra/SwiftLocationExtractor.h"
#include "swift/extractor/infra/SwiftBodyEmissionStrategy.h"
#include "swift/extractor/mangler/SwiftMangler.h"
#include <picosha2.h>
using namespace codeql;
using namespace std::string_literals;
namespace fs = std::filesystem;
static void ensureDirectory(const char* label, const fs::path& dir) {
std::error_code ec;
fs::create_directories(dir, ec);
if (ec) {
std::cerr << "Cannot create " << label << " directory: " << ec.message() << "\n";
std::abort();
}
}
static void archiveFile(const SwiftExtractorConfiguration& config, swift::SourceFile& file) {
auto source = codeql::resolvePath(file.getFilename());
auto destination = config.sourceArchiveDir / source.relative_path();
ensureDirectory("source archive destination", destination.parent_path());
std::error_code ec;
fs::copy(source, destination, fs::copy_options::overwrite_existing, ec);
if (ec) {
std::cerr << "Cannot archive source file " << source << " -> " << destination << ": "
<< ec.message() << "\n";
}
}
static fs::path getFilename(swift::ModuleDecl& module,
swift::SourceFile* primaryFile,
const swift::Decl* lazyDeclaration) {
if (primaryFile) {
return resolvePath(primaryFile->getFilename());
}
if (lazyDeclaration) {
// this code will be thrown away in the near future
SwiftMangler mangler;
auto mangled = mangler.mangledName(*lazyDeclaration);
// mangled name can be too long to use as a file name, so we can't use it directly
mangled = picosha2::hash256_hex_string(mangled);
std::string ret;
ret += module.getRealName().str();
ret += '_';
llvm::SmallVector<char> scratch;
// lazyDeclaration must be a ValueDecl, as already asserted in SwiftMangler::mangledName
ret += llvm::cast<swift::ValueDecl>(lazyDeclaration)->getName().getString(scratch);
ret += '_';
// half a SHA2 is enough
ret += std::string_view(mangled).substr(0, mangled.size() / 2);
return ret;
}
// PCM clang module
if (module.isNonSwiftModule()) {
// Several modules with different names might come from .pcm (clang module) files
// In this case we want to differentiate them
// Moreover, pcm files may come from caches located in different directories, but are
// unambiguously identified by the base file name, so we can discard the absolute directory
fs::path filename = "/pcms";
filename /= fs::path{std::string_view{module.getModuleFilename()}}.filename();
filename += "-";
filename += module.getName().str();
return filename;
}
if (module.isBuiltinModule()) {
// The Builtin module has an empty filename, let's fix that
return "/__Builtin__";
}
std::string_view filename = module.getModuleFilename();
// there is a special case of a module without an actual filename reporting `<imports>`: in this
// case we want to avoid the `<>` characters, in case a dirty DB is imported on Windows
if (filename == "<imports>") {
return "/__imports__";
}
return resolvePath(filename);
}
static llvm::SmallVector<const swift::Decl*> getTopLevelDecls(swift::ModuleDecl& module,
swift::SourceFile* primaryFile,
const swift::Decl* lazyDeclaration) {
llvm::SmallVector<const swift::Decl*> ret;
if (lazyDeclaration) {
ret.push_back(lazyDeclaration);
return ret;
}
ret.push_back(&module);
llvm::SmallVector<swift::Decl*> topLevelDecls;
if (primaryFile) {
primaryFile->getTopLevelDecls(topLevelDecls);
} else {
module.getTopLevelDecls(topLevelDecls);
}
ret.insert(ret.end(), topLevelDecls.data(), topLevelDecls.data() + topLevelDecls.size());
return ret;
}
static TrapType getTrapType(swift::SourceFile* primaryFile, const swift::Decl* lazyDeclaration) {
if (primaryFile) {
return TrapType::source;
}
if (lazyDeclaration) {
return TrapType::lazy_declaration;
}
return TrapType::module;
}
static std::unordered_set<swift::ModuleDecl*> extractDeclarations(
SwiftExtractorState& state,
swift::CompilerInstance& compiler,
swift::ModuleDecl& module,
swift::SourceFile* primaryFile,
const swift::Decl* lazyDeclaration) {
auto filename = getFilename(module, primaryFile, lazyDeclaration);
if (primaryFile) {
state.sourceFiles.push_back(filename);
}
// The extractor can be called several times from different processes with
// the same input file(s). Using `TargetFile` the first process will win, and the following
// will just skip the work
const auto trapType = getTrapType(primaryFile, lazyDeclaration);
auto trap = createTargetTrapDomain(state, filename, trapType);
if (!trap) {
// another process arrived first, nothing to do for us
if (lazyDeclaration) {
state.emittedDeclarations.insert(lazyDeclaration);
}
return {};
}
std::vector<swift::Token> comments;
if (primaryFile && primaryFile->getBufferID().hasValue()) {
auto& sourceManager = compiler.getSourceMgr();
auto tokens = swift::tokenize(compiler.getInvocation().getLangOptions(), sourceManager,
primaryFile->getBufferID().getValue());
for (auto& token : tokens) {
if (token.getKind() == swift::tok::comment) {
comments.push_back(token);
}
}
}
SwiftLocationExtractor locationExtractor(*trap);
locationExtractor.emitFile(primaryFile);
SwiftBodyEmissionStrategy bodyEmissionStrategy(module, primaryFile, lazyDeclaration);
SwiftVisitor visitor(compiler.getSourceMgr(), state, *trap, locationExtractor,
bodyEmissionStrategy);
auto topLevelDecls = getTopLevelDecls(module, primaryFile, lazyDeclaration);
for (auto decl : topLevelDecls) {
visitor.extract(decl);
}
for (auto& comment : comments) {
visitor.extract(comment);
}
return std::move(visitor).getEncounteredModules();
}
static std::unordered_set<std::string> collectInputFilenames(swift::CompilerInstance& compiler) {
// The frontend can be called in many different ways.
// At each invocation we only extract system and builtin modules and any input source files that
// are primary inputs, or all of them if there are no primary inputs (whole module optimization)
std::unordered_set<std::string> sourceFiles;
const auto& inOuts = compiler.getInvocation().getFrontendOptions().InputsAndOutputs;
for (auto& input : inOuts.getAllInputs()) {
if (input.getType() == swift::file_types::TY_Swift &&
(!inOuts.hasPrimaryInputs() || input.isPrimary())) {
sourceFiles.insert(input.getFileName());
}
}
return sourceFiles;
}
static std::vector<swift::ModuleDecl*> collectLoadedModules(swift::CompilerInstance& compiler) {
std::vector<swift::ModuleDecl*> ret;
for (const auto& [id, module] : compiler.getASTContext().getLoadedModules()) {
std::ignore = id;
ret.push_back(module);
}
return ret;
}
void codeql::extractSwiftFiles(SwiftExtractorState& state, swift::CompilerInstance& compiler) {
auto inputFiles = collectInputFilenames(compiler);
std::vector<swift::ModuleDecl*> todo = collectLoadedModules(compiler);
state.encounteredModules.insert(todo.begin(), todo.end());
while (!todo.empty()) {
auto module = todo.back();
todo.pop_back();
bool isFromSourceFile = false;
std::unordered_set<swift::ModuleDecl*> encounteredModules;
for (auto file : module->getFiles()) {
auto sourceFile = llvm::dyn_cast<swift::SourceFile>(file);
if (!sourceFile) {
continue;
}
isFromSourceFile = true;
if (inputFiles.count(sourceFile->getFilename().str()) == 0) {
continue;
}
archiveFile(state.configuration, *sourceFile);
encounteredModules =
extractDeclarations(state, compiler, *module, sourceFile, /*lazy declaration*/ nullptr);
}
if (!isFromSourceFile) {
encounteredModules = extractDeclarations(state, compiler, *module, /*source file*/ nullptr,
/*lazy declaration*/ nullptr);
}
for (auto encountered : encounteredModules) {
if (state.encounteredModules.count(encountered) == 0) {
todo.push_back(encountered);
state.encounteredModules.insert(encountered);
}
}
}
}
static void cleanupPendingDeclarations(SwiftExtractorState& state) {
std::vector<const swift::Decl*> worklist(std::begin(state.pendingDeclarations),
std::end(state.pendingDeclarations));
for (auto decl : worklist) {
if (state.emittedDeclarations.count(decl)) {
state.pendingDeclarations.erase(decl);
}
}
}
static void extractLazy(SwiftExtractorState& state, swift::CompilerInstance& compiler) {
cleanupPendingDeclarations(state);
std::vector<const swift::Decl*> worklist(std::begin(state.pendingDeclarations),
std::end(state.pendingDeclarations));
for (auto pending : worklist) {
extractDeclarations(state, compiler, *pending->getModuleContext(), /*source file*/ nullptr,
pending);
}
}
void codeql::extractExtractLazyDeclarations(SwiftExtractorState& state,
swift::CompilerInstance& compiler) {
// Just in case
const int upperBound = 100;
int iteration = 0;
while (!state.pendingDeclarations.empty() && iteration++ < upperBound) {
extractLazy(state, compiler);
}
if (iteration >= upperBound) {
std::cerr << "Swift extractor reached upper bound while extracting lazy declarations\n";
abort();
}
}