#include "SwiftExtractor.h"

// NOTE(review): the targets of the angle-bracket includes were not recoverable from the copy of
// this file that was reviewed (only bare `#include` directives were left). The list below is
// reconstructed from the names this translation unit uses -- confirm against version control.
#include <filesystem>
#include <string>
#include <string_view>
#include <system_error>
#include <tuple>
#include <unordered_set>
#include <utility>
#include <vector>

#include <llvm/ADT/SmallVector.h>
#include <picosha2.h>
#include <swift/AST/ASTMangler.h>
#include <swift/AST/SourceFile.h>
#include <swift/Frontend/Frontend.h>
#include <swift/Subsystems.h>

#include "swift/extractor/translators/SwiftVisitor.h"
#include "swift/extractor/infra/TargetDomains.h"
#include "swift/extractor/infra/file/Path.h"
#include "swift/extractor/infra/SwiftLocationExtractor.h"
#include "swift/extractor/infra/SwiftBodyEmissionStrategy.h"
#include "swift/extractor/mangler/SwiftMangler.h"
#include "swift/logging/SwiftAssert.h"

using namespace codeql;
using namespace std::string_literals;
namespace fs = std::filesystem;

// Returns the logger used by this translation unit; it is created on first use with the
// name "main".
Logger& main_logger::logger() {
  static Logger ret{"main"};
  return ret;
}

using namespace main_logger;

// Creates `dir` (and any missing parents), asserting on failure.
// `label` is only used to describe the directory in the assertion message.
static void ensureDirectory(const char* label, const fs::path& dir) {
  std::error_code ec;
  fs::create_directories(dir, ec);
  CODEQL_ASSERT(!ec, "Cannot create {} directory ({})", label, ec);
}

// Copies the source file backing `file` into the source archive directory of `config`,
// mirroring its resolved path below that directory.
// A failed copy is only logged: it is treated as best effort, not as an error.
static void archiveFile(const SwiftExtractorConfiguration& config, swift::SourceFile& file) {
  auto source = codeql::resolvePath(file.getFilename());
  auto destination = config.sourceArchiveDir / source.relative_path();

  ensureDirectory("source archive destination", destination.parent_path());

  std::error_code ec;
  fs::copy(source, destination, fs::copy_options::overwrite_existing, ec);

  if (ec) {
    LOG_INFO(
        "Cannot archive source file {} -> {}, probably a harmless race with another process ({})",
        source, destination, ec);
  }
}

// TODO: This will be factored out/replaced with a simplified version of custom mangling
//
// Returns a string identifying `decl`, built on top of the compiler's `ASTMangler` with
// special handling for modules, type aliases and generic type parameters.
static std::string mangledDeclName(const swift::ValueDecl& decl) {
  // ASTMangler::mangleAnyDecl crashes when called on `ModuleDecl`
  if (decl.getKind() == swift::DeclKind::Module) {
    std::string_view moduleName = decl.getModuleContext()->getRealName().str();
    return std::string{moduleName};
  }

  swift::Mangle::ASTMangler mangler;

  if (decl.getKind() == swift::DeclKind::TypeAlias) {
    // In cases like this (when coming from PCM)
    //  typealias CFXMLTree = CFTree
    //  typealias CFXMLTreeRef = CFXMLTree
    // mangleAnyDecl mangles both CFXMLTree and CFXMLTreeRef into 'So12CFXMLTreeRefa'
    // which is not correct and causes inconsistencies. mangleEntity makes these two distinct
    // prefix adds a couple of special symbols, we don't necessarily need them
    return mangler.mangleEntity(&decl);
  }

  if (decl.getKind() == swift::DeclKind::GenericTypeParam) {
    // internal mangling does not distinguish generic type parameters with the same name and
    // position of different functions. We prepend the context (that is, the function) to
    // circumvent that
    auto context = llvm::dyn_cast<swift::ValueDecl>(decl.getDeclContext()->getAsDecl());
    // CODEQL_ASSERT rather than plain `assert`, consistently with the rest of this file: the
    // pointer is dereferenced right below, and `assert` is compiled out under NDEBUG
    CODEQL_ASSERT(context, "generic type parameter context is not a ValueDecl");
    return mangledDeclName(*context) + '_' + mangler.mangleAnyDecl(&decl, /* prefix = */ false);
  }

  return mangler.mangleAnyDecl(&decl, /* prefix = */ false);
}

// Computes the path used to identify the extraction target:
//  * the resolved path of `primaryFile`, when one is given;
//  * otherwise a synthetic `<module>_<name>_<hash>` name for `lazyDeclaration`, when one is given;
//  * otherwise a path derived from `module` (with special cases for clang, builtin and
//    filename-less modules).
static fs::path getFilename(swift::ModuleDecl& module,
                            swift::SourceFile* primaryFile,
                            const swift::Decl* lazyDeclaration) {
  if (primaryFile) {
    return resolvePath(primaryFile->getFilename());
  }

  if (lazyDeclaration) {
    // this code will be thrown away in the near future
    auto decl = llvm::dyn_cast<swift::ValueDecl>(lazyDeclaration);
    CODEQL_ASSERT(decl, "not a ValueDecl");
    auto mangled = mangledDeclName(*decl);
    // mangled name can be too long to use as a file name, so we can't use it directly
    mangled = picosha2::hash256_hex_string(mangled);

    std::string ret;
    ret += module.getRealName().str();
    ret += '_';
    ret += decl->getBaseName().userFacingName();
    ret += '_';
    // half a SHA2 is enough
    ret += std::string_view(mangled).substr(0, mangled.size() / 2);
    return ret;
  }

  // PCM clang module
  if (module.isNonSwiftModule()) {
    // Several modules with different names might come from .pcm (clang module) files
    // In this case we want to differentiate them
    // Moreover, pcm files may come from caches located in different directories, but are
    // unambiguously identified by the base file name, so we can discard the absolute directory
    fs::path filename = "/pcms";
    filename /= fs::path{std::string_view{module.getModuleFilename()}}.filename();
    filename += "-";
    filename += module.getName().str();
    return filename;
  }

  if (module.isBuiltinModule()) {
    // The Builtin module has an empty filename, let's fix that
    return "/__Builtin__";
  }

  std::string_view filename = module.getModuleFilename();
  // there is a special case of a module without an actual filename reporting `<imports>`: in this
  // case we want to avoid the `<>` characters, in case a dirty DB is imported on Windows
  // NOTE(review): the reviewed copy compared against the empty string only, which cannot be what
  // the comment above describes; the empty check is kept so that behaviour is a superset
  if (filename.empty() || filename == "<imports>") {
    return "/__imports__";
  }

  return resolvePath(filename);
}

// Returns the declarations to visit: only `lazyDeclaration` when one is given, otherwise the
// module itself followed by the top level declarations of `primaryFile` (or of the whole
// module when there is no primary file).
static llvm::SmallVector<const swift::Decl*> getTopLevelDecls(swift::ModuleDecl& module,
                                                              swift::SourceFile* primaryFile,
                                                              const swift::Decl* lazyDeclaration) {
  llvm::SmallVector<const swift::Decl*> ret;

  if (lazyDeclaration) {
    ret.push_back(lazyDeclaration);
    return ret;
  }

  ret.push_back(&module);

  llvm::SmallVector<swift::Decl*> topLevelDecls;
  if (primaryFile) {
    primaryFile->getTopLevelDecls(topLevelDecls);
  } else {
    module.getTopLevelDecls(topLevelDecls);
  }
  ret.insert(ret.end(), topLevelDecls.begin(), topLevelDecls.end());
  return ret;
}

// Maps the kind of extraction target to its trap type; a primary file takes precedence over
// a lazy declaration, and a whole module is the fallback.
static TrapType getTrapType(swift::SourceFile* primaryFile, const swift::Decl* lazyDeclaration) {
  if (primaryFile) {
    return TrapType::source;
  }
  if (lazyDeclaration) {
    return TrapType::lazy_declaration;
  }
  return TrapType::module;
}

// Extracts one target (a primary source file, a lazy declaration or a whole module, see
// `getFilename`) and returns the modules reported by the visitor as encountered.
// When the trap domain for the target cannot be created, nothing is extracted, an empty set
// is returned and a lazy declaration, if any, is recorded as emitted.
static std::unordered_set<swift::ModuleDecl*> extractDeclarations(
    SwiftExtractorState& state,
    swift::CompilerInstance& compiler,
    swift::ModuleDecl& module,
    swift::SourceFile* primaryFile,
    const swift::Decl* lazyDeclaration) {
  auto filename = getFilename(module, primaryFile, lazyDeclaration);
  if (primaryFile) {
    state.sourceFiles.push_back(filename);
  }

  // The extractor can be called several times from different processes with
  // the same input file(s). Using `TargetFile` the first process will win, and the following
  // will just skip the work
  const auto trapType = getTrapType(primaryFile, lazyDeclaration);
  auto trap = createTargetTrapDomain(state, filename, trapType);
  if (!trap) {
    // another process arrived first, nothing to do for us
    if (lazyDeclaration) {
      state.emittedDeclarations.insert(lazyDeclaration);
    }
    return {};
  }

  // comments are not part of the visited declarations: collect them by tokenizing the buffer
  std::vector<swift::Token> comments;
  if (primaryFile && primaryFile->getBufferID().hasValue()) {
    auto& sourceManager = compiler.getSourceMgr();
    auto tokens = swift::tokenize(compiler.getInvocation().getLangOptions(), sourceManager,
                                  primaryFile->getBufferID().getValue());
    for (auto& token : tokens) {
      if (token.getKind() == swift::tok::comment) {
        comments.push_back(token);
      }
    }
  }

  SwiftLocationExtractor locationExtractor(*trap);
  locationExtractor.emitFile(primaryFile);
  SwiftBodyEmissionStrategy bodyEmissionStrategy(module, primaryFile, lazyDeclaration);
  SwiftVisitor visitor(compiler.getSourceMgr(), state, *trap, locationExtractor,
                       bodyEmissionStrategy);

  auto topLevelDecls = getTopLevelDecls(module, primaryFile, lazyDeclaration);
  for (auto decl : topLevelDecls) {
    visitor.extract(decl);
  }
  for (auto& comment : comments) {
    visitor.extract(comment);
  }
  return std::move(visitor).getEncounteredModules();
}

// Returns the names of the Swift input files this invocation is responsible for.
static std::unordered_set<std::string> collectInputFilenames(swift::CompilerInstance& compiler) {
  // The frontend can be called in many different ways.
  // At each invocation we only extract system and builtin modules and any input source files that
  // are primary inputs, or all of them if there are no primary inputs (whole module optimization)
  std::unordered_set<std::string> sourceFiles;
  const auto& inOuts = compiler.getInvocation().getFrontendOptions().InputsAndOutputs;
  for (auto& input : inOuts.getAllInputs()) {
    if (input.getType() == swift::file_types::TY_Swift &&
        (!inOuts.hasPrimaryInputs() || input.isPrimary())) {
      sourceFiles.insert(input.getFileName());
    }
  }
  return sourceFiles;
}

// Returns the modules currently loaded in the AST context of `compiler`.
static std::vector<swift::ModuleDecl*> collectLoadedModules(swift::CompilerInstance& compiler) {
  const auto& loadedModules = compiler.getASTContext().getLoadedModules();
  std::vector<swift::ModuleDecl*> ret;
  ret.reserve(loadedModules.size());
  for (const auto& [id, module] : loadedModules) {
    std::ignore = id;
    ret.push_back(module);
  }
  return ret;
}

// Extracts all loaded modules, plus every module transitively encountered while doing so.
// Modules backed by source files only have the files that are inputs of this invocation
// extracted (and archived); any other module is extracted as a whole.
void codeql::extractSwiftFiles(SwiftExtractorState& state, swift::CompilerInstance& compiler) {
  auto inputFiles = collectInputFilenames(compiler);
  std::vector<swift::ModuleDecl*> todo = collectLoadedModules(compiler);
  state.encounteredModules.insert(todo.begin(), todo.end());

  while (!todo.empty()) {
    auto module = todo.back();
    todo.pop_back();
    bool isFromSourceFile = false;
    std::unordered_set<swift::ModuleDecl*> encounteredModules;
    for (auto file : module->getFiles()) {
      auto sourceFile = llvm::dyn_cast<swift::SourceFile>(file);
      if (!sourceFile) {
        continue;
      }
      isFromSourceFile = true;
      if (inputFiles.count(sourceFile->getFilename().str()) == 0) {
        continue;
      }
      archiveFile(state.configuration, *sourceFile);
      // accumulate rather than assign: a module can have several input files, and the modules
      // encountered by one file must not be dropped when the next one is processed
      auto encounteredInFile =
          extractDeclarations(state, compiler, *module, sourceFile, /*lazy declaration*/ nullptr);
      encounteredModules.insert(encounteredInFile.begin(), encounteredInFile.end());
    }
    if (!isFromSourceFile) {
      encounteredModules = extractDeclarations(state, compiler, *module, /*source file*/ nullptr,
                                               /*lazy declaration*/ nullptr);
    }
    for (auto encountered : encounteredModules) {
      if (state.encounteredModules.count(encountered) == 0) {
        todo.push_back(encountered);
        state.encounteredModules.insert(encountered);
      }
    }
  }
}

// Drops from the pending declarations those that have already been emitted.
static void cleanupPendingDeclarations(SwiftExtractorState& state) {
  // iterate over a snapshot, as the pending set is modified in the loop
  std::vector<const swift::Decl*> worklist(std::begin(state.pendingDeclarations),
                                           std::end(state.pendingDeclarations));
  for (auto decl : worklist) {
    if (state.emittedDeclarations.count(decl)) {
      state.pendingDeclarations.erase(decl);
    }
  }
}

// Runs one round of lazy extraction over the declarations that are still pending.
static void extractLazy(SwiftExtractorState& state, swift::CompilerInstance& compiler) {
  cleanupPendingDeclarations(state);
  // iterate over a snapshot of the pending set rather than over the set itself
  std::vector<const swift::Decl*> worklist(std::begin(state.pendingDeclarations),
                                           std::end(state.pendingDeclarations));
  for (auto pending : worklist) {
    extractDeclarations(state, compiler, *pending->getModuleContext(), /*source file*/ nullptr,
                        pending);
  }
}

// Repeats lazy extraction rounds until no pending declaration is left, asserting if that
// takes more than a fixed number of rounds.
void codeql::extractExtractLazyDeclarations(SwiftExtractorState& state,
                                            swift::CompilerInstance& compiler) {
  // Just in case
  const int upperBound = 100;
  int iteration = 0;
  while (!state.pendingDeclarations.empty()) {
    CODEQL_ASSERT(iteration++ < upperBound,
                  "Swift extractor reached upper bound while extracting lazy declarations");
    extractLazy(state, compiler);
  }
}