From f338ae66cb1aa339179daf5e013fdd6c8c52cebf Mon Sep 17 00:00:00 2001 From: kamilongus Date: Tue, 16 Apr 2019 19:08:12 +0400 Subject: [PATCH 1/7] Store repository index in database instead of file system Issue: https://github.com/aelve/codesearch/issues/250 --- .../codesearch/core/index/LanguageIndex.scala | 9 ++- .../core/index/repository/Downloader.scala | 1 - .../core/meta/HackageMetaDownloader.scala | 55 ++++++++++++++++++- .../core/meta/NpmMetaDownloader.scala | 2 +- project/Builder.scala | 5 +- scripts/deploy.sh | 9 --- 6 files changed, 62 insertions(+), 19 deletions(-) delete mode 100755 scripts/deploy.sh diff --git a/core/src/main/scala/codesearch/core/index/LanguageIndex.scala b/core/src/main/scala/codesearch/core/index/LanguageIndex.scala index d3d5d33..5ae08df 100644 --- a/core/src/main/scala/codesearch/core/index/LanguageIndex.scala +++ b/core/src/main/scala/codesearch/core/index/LanguageIndex.scala @@ -15,6 +15,7 @@ import codesearch.core.index.repository._ import codesearch.core.model.DefaultTable import codesearch.core.syntax.stream._ import fs2.Stream +import fs2.Chunk import fs2.io.file import fs2.text.utf8Encode import io.chrisdavenport.log4cats.SelfAwareStructuredLogger @@ -61,7 +62,7 @@ trait LanguageIndex[A <: DefaultTable] { .covary[IO] .map(_.toString + "\n") .through(utf8Encode) - .to(file.writeAll(cindexDir.dirsToIndex[NioPath], BlockingEC, List(CREATE, TRUNCATE_EXISTING))) + .through(file.writeAll(cindexDir.dirsToIndex[NioPath], BlockingEC, List(CREATE, TRUNCATE_EXISTING))) .compile .drain _ <- IO(Process(args, None, env: _*) !) 
@@ -91,8 +92,10 @@ trait LanguageIndex[A <: DefaultTable] { * @return count of updated packages */ def updatePackages(limit: Option[Int]): IO[Int] = { - val packages: Stream[IO, (String, String)] = getLastVersions.filterNotM { - case (packageName, packageVersion) => packageIsExists(packageName, packageVersion) + val chunkSize = 10000 + val packages: Stream[IO, (String, String)] = getLastVersions.chunkN(chunkSize).flat.filterNotM { + case (packageName, packageVersion) => + packageIsExists(packageName, packageVersion) } logger.debug("UPDATE PACKAGES") >> limit diff --git a/core/src/main/scala/codesearch/core/index/repository/Downloader.scala b/core/src/main/scala/codesearch/core/index/repository/Downloader.scala index 38c3bff..fc05a16 100644 --- a/core/src/main/scala/codesearch/core/index/repository/Downloader.scala +++ b/core/src/main/scala/codesearch/core/index/repository/Downloader.scala @@ -1,6 +1,5 @@ package codesearch.core.index.repository -import java.io.File import java.nio.ByteBuffer import java.nio.file.Path import java.nio.file.StandardOpenOption.{CREATE, TRUNCATE_EXISTING} diff --git a/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala index d114367..d24f2e4 100644 --- a/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala @@ -1,19 +1,25 @@ package codesearch.core.meta +import java.io.InputStream + import cats.Monad -import cats.effect.Sync +import cats.effect.{ContextShift, Sync} import cats.syntax.flatMap._ import cats.syntax.functor._ +import codesearch.core.BlockingEC import codesearch.core.config.HaskellConfig import codesearch.core.index.repository.Downloader import codesearch.core.util.Unarchiver import com.softwaremill.sttp._ +import fs2.io.file +import fs2.{Pipe, Stream} import io.chrisdavenport.log4cats.Logger import io.chrisdavenport.log4cats.slf4j.Slf4jLogger 
+import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream} import org.rauschig.jarchivelib.ArchiveFormat.TAR import org.rauschig.jarchivelib.CompressionType.GZIP -class HackageMetaDownloader[F[_]: Monad]( +class HackageMetaDownloader[F[_]: Monad: Sync: ContextShift]( config: HaskellConfig, unarchiver: Unarchiver[F], downloader: Downloader[F], @@ -27,10 +33,53 @@ class HackageMetaDownloader[F[_]: Monad]( _ <- unarchiver.extract(config.repoArchivePath, config.repoPath, TAR, GZIP) _ <- logger.info("Downloading finished") } yield () + + def storeMeta: F[Unit] = { + file + .readAll[F](config.repoArchivePath, BlockingEC, chunkSize = 4096) + .through(tarEntries) + .through(flatPackages) + } + + private final case class Package(name: String, version: String) + + private def tarEntries: Pipe[F, InputStream, TarArchiveEntry] = { input => + input.flatMap { inputStream => + Stream + .bracket(Sync[F].delay(new TarArchiveInputStream(inputStream)))(tarStream => Sync[F].delay(tarStream.close())) + .flatMap { tarStream => + Stream.unfoldEval[F, TarArchiveInputStream, TarArchiveEntry](tarStream) { tarStream => + Sync[F].delay { + tarStream.getNextTarEntry match { + case entry: TarArchiveEntry => Some(entry, tarStream) + case _ => None + } + } + } + } + } + } + + private def flatPackages: Pipe[F, TarArchiveEntry, Package] = { input => + input.flatMap { entry => + val parentName = entry.getName + val nestedEntries = entry.getDirectoryEntries + Stream.emits(nestedEntries.map(nested => Package(parentName, nested.getName))) + } + } + + private def store: Pipe[F, Package, Unit] = { input => + val batchSize = 10000 + input.chunkN(batchSize).map { packages => + val packagesBatch = packages.toList + + } //etc + } + } object HackageMetaDownloader { - def apply[F[_]: Sync]( + def apply[F[_]: Sync: ContextShift]( config: HaskellConfig, unarchiver: Unarchiver[F], downloader: Downloader[F] diff --git 
a/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala index 5f81d64..df85a83 100644 --- a/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala @@ -40,7 +40,7 @@ class NpmMetaDownloader[F[_]: Sync: ContextShift]( .through(decoder[NpmRegistryPackage]) .map(_.asJson.noSpaces + "\n") .through(utf8Encode) - .to(file.writeAll(config.repoJsonPath, BlockingEC, List(CREATE, TRUNCATE_EXISTING))) + .through(file.writeAll(config.repoJsonPath, BlockingEC, List(CREATE, TRUNCATE_EXISTING))) .compile .drain _ <- logger.info("Downloading finished") diff --git a/project/Builder.scala b/project/Builder.scala index e39d39e..f211e3e 100644 --- a/project/Builder.scala +++ b/project/Builder.scala @@ -65,8 +65,8 @@ object Builder { "com.typesafe.slick" %% "slick-hikaricp" % "3.2.3", "org.postgresql" % "postgresql" % "42.2.2", "com.softwaremill.sttp" %% "async-http-client-backend-fs2" % "1.3.8", - "co.fs2" %% "fs2-core" % "1.0.0", - "co.fs2" %% "fs2-io" % "1.0.0", + "co.fs2" %% "fs2-core" % "1.0.4", + "co.fs2" %% "fs2-io" % "1.0.4", "io.circe" %% "circe-fs2" % "0.10.0", "io.circe" %% "circe-core" % "0.10.0", "io.circe" %% "circe-generic" % "0.10.0", @@ -75,6 +75,7 @@ object Builder { "com.github.pureconfig" %% "pureconfig" % "0.9.2", "com.github.pureconfig" %% "pureconfig-cats-effect" % "0.9.2", "io.chrisdavenport" %% "log4cats-slf4j" % "0.2.0-RC2", + "org.apache.commons" % "commons-compress" % "1.18", "org.scalactic" %% "scalactic" % "3.0.5", "org.scalatest" %% "scalatest" % "3.0.5" % "test" ), diff --git a/scripts/deploy.sh b/scripts/deploy.sh deleted file mode 100755 index 9ba8a8c..0000000 --- a/scripts/deploy.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -ssh root@167.99.88.190 " - set -x - cd /root/aelve/codesearch - git pull - sbt web-server/assembly - systemctl restart codesearch.service -" From 
1d5c496f66248aa89dfcf76bf90e9d06dd364667 Mon Sep 17 00:00:00 2001 From: kamilongus Date: Sun, 21 Apr 2019 23:50:52 +0400 Subject: [PATCH 2/7] some refactoring Issue: https://github.com/aelve/codesearch/issues/250 --- core/src/main/resources/application.conf | 12 +- .../V1555715509__create_schema.sql | 0 .../src/main/scala/codesearch/core/Main.scala | 46 ++++--- .../main/scala/codesearch/core/Program.scala | 42 +++---- .../scala/codesearch/core/config/Config.scala | 43 ++----- .../scala/codesearch/core/db/DefaultDB.scala | 4 +- .../codesearch/core/db/FlywayMigration.scala | 19 +++ .../scala/codesearch/core/db/Transactor.scala | 24 ++++ .../core/db/repository/PackageIndexRep.scala | 33 +++++ .../core/db/repository/PackageRep.scala | 35 ++++++ .../codesearch/core/index/LanguageIndex.scala | 46 +++---- .../core/index/details/NpmDetails.scala | 39 ------ .../core/index/indexer/HaskellIndexer.scala | 15 +++ .../core/index/indexer/Indexer.scala | 84 +++++++++++++ .../index/indexer/JavaScriptIndexer.scala | 15 +++ .../core/index/indexer/RubyIndexer.scala | 15 +++ .../core/index/indexer/RustIndexer.scala | 15 +++ .../core/meta/CratesMetaDownloader.scala | 2 +- .../core/meta/GemMetaDownloader.scala | 2 +- .../core/meta/HackageMetaDownloader.scala | 94 +++----------- .../codesearch/core/meta/MetaDownloader.scala | 45 ++++++- .../core/meta/NpmMetaDownloader.scala | 115 +++--------------- .../unarchiver/HaskellIndexUnarchiver.scala | 53 ++++++++ .../unarchiver/JavaScriptUnarchiver.scala | 82 +++++++++++++ .../meta/unarchiver/RustIndexUnarchiver.scala | 53 ++++++++ .../unarchiver/StreamIndexUnarchiver.scala | 8 ++ .../codesearch/core/model/CratesTable.scala | 2 +- .../codesearch/core/model/DefaultTable.scala | 10 +- .../codesearch/core/model/GemTable.scala | 2 +- .../codesearch/core/model/HackageTable.scala | 2 +- .../codesearch/core/model/NpmTable.scala | 2 +- .../scala/codesearch/core/model/Version.scala | 5 +- .../codesearch/core/util/manatki/Raise.scala | 16 +++ 
.../core/util/manatki/syntax/raise.scala | 30 +++++ project/Builder.scala | 8 +- 35 files changed, 687 insertions(+), 331 deletions(-) create mode 100644 core/src/main/resources/db.migration/V1555715509__create_schema.sql create mode 100644 core/src/main/scala/codesearch/core/db/FlywayMigration.scala create mode 100644 core/src/main/scala/codesearch/core/db/Transactor.scala create mode 100644 core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala create mode 100644 core/src/main/scala/codesearch/core/db/repository/PackageRep.scala delete mode 100644 core/src/main/scala/codesearch/core/index/details/NpmDetails.scala create mode 100644 core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala create mode 100644 core/src/main/scala/codesearch/core/index/indexer/Indexer.scala create mode 100644 core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala create mode 100644 core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala create mode 100644 core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala create mode 100644 core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala create mode 100644 core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala create mode 100644 core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala create mode 100644 core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala create mode 100644 core/src/main/scala/codesearch/core/util/manatki/Raise.scala create mode 100644 core/src/main/scala/codesearch/core/util/manatki/syntax/raise.scala diff --git a/core/src/main/resources/application.conf b/core/src/main/resources/application.conf index 412b6c3..0a087ad 100644 --- a/core/src/main/resources/application.conf +++ b/core/src/main/resources/application.conf @@ -19,27 +19,23 @@ db { languagesConfig { haskell { + repository = "hackage" repoIndexUrl = 
"http://hackage.haskell.org/packages/index.tar.gz" - repoArchivePath = "./data/meta/haskell/index.tar.gz" - repoPath = "./data/meta/haskell/" concurrentTasksCount = 30 } rust { + repository = "crates" repoIndexUrl = "https://github.com/rust-lang/crates.io-index/archive/master.zip" - repoArchivePath = "./data/meta/rust/archive.zip" - repoPath = "./data/meta/rust/" concurrentTasksCount = 30 } ruby { + repository = "gem" repoIndexUrl = "http://rubygems.org/latest_specs.4.8.gz" - repoArchivePath = "./data/meta/ruby/ruby_index.gz" - repoJsonPath = "./data/meta/ruby/ruby_index.json" - scriptPath = "./scripts/update_index.rb" concurrentTasksCount = 30 } javascript { + repository = "npm" repoIndexUrl = "https://replicate.npmjs.com/_all_docs?include_docs=true" - repoJsonPath = "./data/meta/npm/npm_packages_index.json" concurrentTasksCount = 30 } } diff --git a/core/src/main/resources/db.migration/V1555715509__create_schema.sql b/core/src/main/resources/db.migration/V1555715509__create_schema.sql new file mode 100644 index 0000000..e69de29 diff --git a/core/src/main/scala/codesearch/core/Main.scala b/core/src/main/scala/codesearch/core/Main.scala index 576103c..6012ff2 100644 --- a/core/src/main/scala/codesearch/core/Main.scala +++ b/core/src/main/scala/codesearch/core/Main.scala @@ -22,33 +22,39 @@ object Main extends IOApp { lang: String = "all" ) - case class LangRep[A <: DefaultTable]( + case class LangRep[A, F[_]]( db: DefaultDB[A], langIndex: LanguageIndex[A], - metaDownloader: MetaDownloader[IO] + metaDownloader: MetaDownloader[F] ) - def run(args: List[String]): IO[ExitCode] = + def run(args: List[String]): IO[ExitCode] = { Resource.make(IO(AsyncHttpClientFs2Backend[IO]()))(client => IO(client.close())).use { implicit httpClient => for { - params <- CLI.params(args) config <- Config.load[IO] - - unarchiver = Unarchiver[IO] - implicit0(downloader: Downloader[IO]) = Downloader.create[IO] - - hackageMeta <- HackageMetaDownloader(config.languagesConfig.haskell, 
unarchiver, downloader) - cratesMeta <- CratesMetaDownloader(config.languagesConfig.rust, unarchiver, downloader) - gemMeta <- GemMetaDownloader(config.languagesConfig.ruby, downloader) - npmMeta <- NpmMetaDownloader(config.languagesConfig.javascript, downloader) - - langReps = Map( - "haskell" -> LangRep[HackageTable](HackageDB, HaskellIndex(config), hackageMeta), - "rust" -> LangRep[CratesTable](CratesDB, RustIndex(config), cratesMeta), - "ruby" -> LangRep[GemTable](GemDB, RubyIndex(config), gemMeta), - "javascript" -> LangRep[NpmTable](NpmDB, JavaScriptIndex(config), npmMeta) - ) - exitCode <- Program(langReps) >>= (_.run(params)) + _ <- FlywayMigration.migrate[IO](config.db) + exitCode <- Transactor.create[IO](config.db).use { xa => + for { + params <- CLI.params(args) + + unarchiver = Unarchiver[IO] + implicit0(downloader: Downloader[IO]) = Downloader.create[IO] + + hackageMeta <- HackageMetaDownloader(config.languagesConfig.haskell, unarchiver, downloader, xa) + cratesMeta <- CratesMetaDownloader(config.languagesConfig.rust, unarchiver, downloader) + gemMeta <- GemMetaDownloader(config.languagesConfig.ruby, downloader) + npmMeta <- NpmMetaDownloader(config.languagesConfig.javascript, downloader) + + langReps = Map( + "haskell" -> LangRep[HackageTable, IO](HackageDB, HaskellIndex(config), hackageMeta), + "rust" -> LangRep[CratesTable, IO](CratesDB, RustIndex(config), cratesMeta), + "ruby" -> LangRep[GemTable, IO](GemDB, RubyIndex(config), gemMeta), + "javascript" -> LangRep[NpmTable, IO](NpmDB, JavaScriptIndex(config), npmMeta) + ) + exitCode <- Program(langReps) >>= (_.run(params)) + } yield exitCode + } } yield exitCode } + } } diff --git a/core/src/main/scala/codesearch/core/Program.scala b/core/src/main/scala/codesearch/core/Program.scala index 514191a..8f92338 100644 --- a/core/src/main/scala/codesearch/core/Program.scala +++ b/core/src/main/scala/codesearch/core/Program.scala @@ -3,16 +3,24 @@ package codesearch.core import cats.effect._ import 
cats.instances.list._ import cats.syntax.applicative._ +import cats.syntax.flatMap._ import cats.syntax.foldable._ import cats.syntax.traverse._ +import cats.syntax.functor._ import codesearch.core.Main.{LangRep, Params} import codesearch.core.model.DefaultTable import io.chrisdavenport.log4cats.Logger +import codesearch.core.util.manatki.syntax.raise._ import io.chrisdavenport.log4cats.slf4j.Slf4jLogger -class Program(langReps: Map[String, LangRep[_ <: DefaultTable]], logger: Logger[IO]) { +case class InvalidLang(lang: String) extends RuntimeException(s"Unsupported language $lang") - def run(params: Params): IO[ExitCode] = +class Program[F[_]: Sync: ContextShift]( + langReps: Map[String, LangRep[_ <: DefaultTable]], + logger: Logger[F] +) { + + def run(params: Params): F[ExitCode] = for { _ <- if (params.lang == "all") { logger.info("Codesearch-core started for all supported languages") @@ -20,47 +28,38 @@ class Program(langReps: Map[String, LangRep[_ <: DefaultTable]], logger: Logger[ logger.info(s"Codesearch-core started for language ${params.lang}") } - _ <- initDb(params).whenA(params.initDB) _ <- downloadMeta(params).whenA(params.downloadMeta) _ <- updatePackages(params).whenA(params.updatePackages) _ <- buildIndex(params).whenA(params.buildIndex) } yield ExitCode.Success - object InvalidLang extends RuntimeException(s"Unsupported language") - - def findRepositories(lang: String): IO[List[LangRep[_]]] = { + def findRepositories(lang: String): F[List[LangRep[_]]] = { if (lang == "all") { - IO.pure(langReps.values.toList) + langReps.values.toList.pure[F].widen } else { langReps.get(lang) match { - case Some(l) => IO.pure(List(l)) - case None => IO.raiseError(InvalidLang) + case Some(l) => List(l).pure[F].widen + case None => InvalidLang(lang).raise } } } - def initDb(params: Params): IO[Unit] = - for { - languages <- findRepositories(params.lang) - _ <- languages.traverse_(_.db.initDB) - } yield () - - def downloadMeta(params: Params): IO[Unit] = { + def 
downloadMeta(params: Params): F[Unit] = { for { languages <- findRepositories(params.lang) - _ <- languages.traverse_(_.metaDownloader.downloadMeta) + _ <- languages.traverse_(_.metaDownloader.download) } yield () } - def updatePackages(params: Params): IO[Unit] = + def updatePackages(params: Params): F[Unit] = for { languages <- findRepositories(params.lang) updated <- languages.traverse(_.langIndex.updatePackages(params.limitedCountPackages)) _ <- logger.info(s"Updated: ${updated.sum}") } yield () - def buildIndex(params: Params): IO[Unit] = + def buildIndex(params: Params): F[Unit] = for { languages <- findRepositories(params.lang) _ <- languages.traverse_(_.langIndex.buildIndex) @@ -69,6 +68,7 @@ class Program(langReps: Map[String, LangRep[_ <: DefaultTable]], logger: Logger[ } object Program { - def apply(langReps: Map[String, LangRep[_ <: DefaultTable]]): IO[Program] = - Slf4jLogger.fromClass[IO](getClass).map(logger => new Program(langReps, logger)) + def apply[F[_]: Sync]( + langReps: Map[String, LangRep[_ <: DefaultTable]] + ): F[Program[F]] = Slf4jLogger.fromClass[F](getClass).map(logger => new Program(langReps, logger)) } diff --git a/core/src/main/scala/codesearch/core/config/Config.scala b/core/src/main/scala/codesearch/core/config/Config.scala index 245b96d..3b24704 100644 --- a/core/src/main/scala/codesearch/core/config/Config.scala +++ b/core/src/main/scala/codesearch/core/config/Config.scala @@ -1,7 +1,6 @@ package codesearch.core.config import java.net.URI -import java.nio.file.Path import cats.effect.Sync import pureconfig.module.catseffect._ @@ -19,7 +18,13 @@ case class DatabaseConfig( port: Int, name: String, user: String, - password: String + password: String, + properties: DatabaseProperties +) + +case class DatabaseProperties( + driver: String, + url: String ) case class SnippetConfig( @@ -29,37 +34,15 @@ case class SnippetConfig( ) case class LanguagesConfig( - haskell: HaskellConfig, - ruby: RubyConfig, - rust: RustConfig, - javascript: 
JavaScriptConfig -) - -case class HaskellConfig( - repoIndexUrl: URI, - repoArchivePath: Path, - repoPath: Path, - concurrentTasksCount: Int -) - -case class RubyConfig( - repoIndexUrl: URI, - repoArchivePath: Path, - repoJsonPath: Path, - scriptPath: Path, - concurrentTasksCount: Int -) - -case class RustConfig( - repoIndexUrl: URI, - repoArchivePath: Path, - repoPath: Path, - concurrentTasksCount: Int + haskell: LanguageConfig, + ruby: LanguageConfig, + rust: LanguageConfig, + javascript: LanguageConfig ) -case class JavaScriptConfig( +case class LanguageConfig( + repository: String, repoIndexUrl: URI, - repoJsonPath: Path, concurrentTasksCount: Int ) diff --git a/core/src/main/scala/codesearch/core/db/DefaultDB.scala b/core/src/main/scala/codesearch/core/db/DefaultDB.scala index fa1adc3..25ac1be 100644 --- a/core/src/main/scala/codesearch/core/db/DefaultDB.scala +++ b/core/src/main/scala/codesearch/core/db/DefaultDB.scala @@ -70,9 +70,9 @@ trait DefaultDB[T <: DefaultTable] { } def initDB: IO[Unit] = - IO.fromFuture(IO(db.run(MTable.getTables))).flatMap { vector => + IO.fromFuture(IO(db.run(MTable.getTables))).flatMap { tables => IO( - if (!vector.exists(_.name.name == table.baseTableRow.tableName)) + if (!tables.exists(_.name.name == table.baseTableRow.tableName)) db.run(table.schema.create) ) } diff --git a/core/src/main/scala/codesearch/core/db/FlywayMigration.scala b/core/src/main/scala/codesearch/core/db/FlywayMigration.scala new file mode 100644 index 0000000..09a552e --- /dev/null +++ b/core/src/main/scala/codesearch/core/db/FlywayMigration.scala @@ -0,0 +1,19 @@ +package codesearch.core.db + +import cats.effect.Sync +import codesearch.core.config.DatabaseConfig +import org.flywaydb.core.Flyway + +object FlywayMigration { + def migrate[F[_]: Sync](config: DatabaseConfig): F[Unit] = Sync[F].delay { + Flyway + .configure() + .dataSource( + config.properties.url, + config.user, + config.password + ) + .load() + .migrate() + } +} diff --git 
a/core/src/main/scala/codesearch/core/db/Transactor.scala b/core/src/main/scala/codesearch/core/db/Transactor.scala new file mode 100644 index 0000000..ec7cce5 --- /dev/null +++ b/core/src/main/scala/codesearch/core/db/Transactor.scala @@ -0,0 +1,24 @@ +package codesearch.core.db + +import cats.effect.{Async, ContextShift, Resource} +import codesearch.core.config.DatabaseConfig +import doobie.hikari.HikariTransactor +import doobie.util.ExecutionContexts + +object Transactor { + def create[F[_]: Async: ContextShift](config: DatabaseConfig): Resource[F, HikariTransactor[F]] = { + import config._ + for { + connectEC <- ExecutionContexts.fixedThreadPool[F](32) + transactionEC <- ExecutionContexts.cachedThreadPool[F] + xa <- HikariTransactor.newHikariTransactor( + properties.driver, + properties.url, + user, + password, + connectEC, + transactionEC + ) + } yield xa + } +} diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala b/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala new file mode 100644 index 0000000..0942715 --- /dev/null +++ b/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala @@ -0,0 +1,33 @@ +package codesearch.core.db.repository + +import cats.Monad +import cats.implicits._ +import doobie._ +import doobie.implicits._ + +final case class PackageIndex( + name: String, + version: String, + repository: String +) + +trait PackageIndexRep[F[_]] { + def insertRepIndexes(packages: List[PackageIndex]): F[Int] +} + +object PackageIndexRep { + + private val batchInsertQuery = + """ + |INSERT INTO repository_index(name, version, repository) + | VALUES (?, ?, ?) 
+ | ON CONFLICT (name, repository) DO UPDATE + | SET version = excluded.version + """.stripMargin + + def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexRep[F] = + (packages: List[PackageIndex]) => + Update[PackageIndex](batchInsertQuery) + .updateMany(packages) + .transact(xa) +} diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageRep.scala b/core/src/main/scala/codesearch/core/db/repository/PackageRep.scala new file mode 100644 index 0000000..211c185 --- /dev/null +++ b/core/src/main/scala/codesearch/core/db/repository/PackageRep.scala @@ -0,0 +1,35 @@ +package codesearch.core.db.repository + +import java.time.LocalDateTime + +import cats.Monad +import doobie.Transactor +import doobie.implicits._ +import fs2.Stream + +final case class PackageTableRow( + name: String, + version: String, + repository: String, + updatedAt: LocalDateTime +) + +final case class Package( + name: String, + version: String +) + +trait PackageRep[F[_]] { + def findByRepository(repository: String): Stream[F, Package] +} + +object PackageRep { + def apply[F[_]: Monad](xa: Transactor[F]): PackageRep[F] = new PackageRep[F] { + def findByRepository(repository: String): Stream[F, Package] = { + sql"SELECT name, version FROM package WHERE repository = $repository" + .query[Package] + .stream + .transact(xa) + } + } +} diff --git a/core/src/main/scala/codesearch/core/index/LanguageIndex.scala b/core/src/main/scala/codesearch/core/index/LanguageIndex.scala index 5ae08df..3039fbd 100644 --- a/core/src/main/scala/codesearch/core/index/LanguageIndex.scala +++ b/core/src/main/scala/codesearch/core/index/LanguageIndex.scala @@ -4,9 +4,14 @@ import java.nio.file.StandardCopyOption.REPLACE_EXISTING import java.nio.file.StandardOpenOption.{CREATE, TRUNCATE_EXISTING} import java.nio.file.{Files, Path => NioPath} -import cats.effect.{ContextShift, IO} +import cats.effect.{ContextShift, Sync} import cats.instances.int._ +import cats.effect._ +import cats.instances.list._ +import 
cats.syntax.applicative._ import cats.syntax.flatMap._ +import cats.syntax.foldable._ +import cats.syntax.traverse._ import cats.syntax.functor._ import codesearch.core.BlockingEC import codesearch.core.db.DefaultDB @@ -14,6 +19,7 @@ import codesearch.core.index.directory.{Directory, СindexDirectory} import codesearch.core.index.repository._ import codesearch.core.model.DefaultTable import codesearch.core.syntax.stream._ +import codesearch.core.util.manatki.syntax.raise._ import fs2.Stream import fs2.Chunk import fs2.io.file @@ -23,12 +29,11 @@ import io.chrisdavenport.log4cats.slf4j.Slf4jLogger import scala.sys.process.Process -trait LanguageIndex[A <: DefaultTable] { - self: DefaultDB[A] => +trait LanguageIndex[F[_]: Sync] { - protected implicit def shift: ContextShift[IO] + protected implicit def shift: ContextShift[F] - protected val logger: SelfAwareStructuredLogger[IO] = Slf4jLogger.unsafeCreate[IO] + protected val logger: SelfAwareStructuredLogger[F] = Slf4jLogger.unsafeCreate[F] protected def cindexDir: СindexDirectory @@ -38,7 +43,7 @@ trait LanguageIndex[A <: DefaultTable] { * Build new index from only latest version of each package and * replace old index with new one. 
*/ - def buildIndex: IO[Unit] = { + def buildIndex: F[Unit] = { def latestPackagePaths = verNames.map { versions => versions.map { case (packageName, version) => @@ -46,36 +51,35 @@ trait LanguageIndex[A <: DefaultTable] { } } - def dropTempIndexFile = IO(Files.deleteIfExists(cindexDir.tempIndexDirAs[NioPath])) + def dropTempIndexFile = F(Files.deleteIfExists(cindexDir.tempIndexDirAs[NioPath])) - def createCSearchDir = IO( + def createCSearchDir = ( if (Files.notExists(СindexDirectory.root)) Files.createDirectories(СindexDirectory.root) - ) + ).pure[F].widen - def indexPackages(packageDirs: Seq[NioPath]): IO[Unit] = { + def indexPackages(packageDirs: Seq[NioPath]): F[Unit] = { val args = Seq("cindex", cindexDir.dirsToIndex[String]) val env = Seq("CSEARCHINDEX" -> cindexDir.tempIndexDirAs[String]) for { _ <- Stream .emits(packageDirs) - .covary[IO] + .covary[F] .map(_.toString + "\n") .through(utf8Encode) .through(file.writeAll(cindexDir.dirsToIndex[NioPath], BlockingEC, List(CREATE, TRUNCATE_EXISTING))) .compile .drain - _ <- IO(Process(args, None, env: _*) !) 
+ _ <- (Process(args, None, env: _*) !).pure[F].widen } yield () } - def replaceIndexFile = IO( + def replaceIndexFile = Files.move( cindexDir.tempIndexDirAs[NioPath], cindexDir.indexDirAs[NioPath], REPLACE_EXISTING - ) - ) + ).pure[F].widen for { packageDirs <- latestPackagePaths @@ -91,9 +95,9 @@ trait LanguageIndex[A <: DefaultTable] { * * @return count of updated packages */ - def updatePackages(limit: Option[Int]): IO[Int] = { + def updatePackages(limit: Option[Int]): F[Int] = { val chunkSize = 10000 - val packages: Stream[IO, (String, String)] = getLastVersions.chunkN(chunkSize).flat.filterNotM { + val packages: Stream[F, (String, String)] = getLastVersions.chunkN(chunkSize).flat.filterNotM { case (packageName, packageVersion) => packageIsExists(packageName, packageVersion) } @@ -116,8 +120,8 @@ trait LanguageIndex[A <: DefaultTable] { protected def buildFsUrl(packageName: String, version: String): NioPath protected def archiveDownloadAndExtract[B <: SourcePackage: Directory](pack: B)( - implicit repository: SourcesDownloader[IO, B] - ): IO[Int] = { + implicit repository: SourcesDownloader[F, B] + ): F[Int] = { val task = for { _ <- repository.downloadSources(pack) rowsCount <- insertOrUpdate(pack) @@ -131,7 +135,7 @@ trait LanguageIndex[A <: DefaultTable] { * * @return last versions of packages */ - protected def getLastVersions: Stream[IO, (String, String)] + protected def getLastVersions: Stream[F, (String, String)] /** * Update source code from remote repository @@ -141,7 +145,7 @@ trait LanguageIndex[A <: DefaultTable] { * @param version of package * @return count of downloaded files (source files) */ - protected def updateSources(name: String, version: String): IO[Int] + protected def updateSources(name: String, version: String): F[Int] } case class BadExitCode(code: Int) extends Exception(s"Process returned a bad exit code: $code") diff --git a/core/src/main/scala/codesearch/core/index/details/NpmDetails.scala 
b/core/src/main/scala/codesearch/core/index/details/NpmDetails.scala deleted file mode 100644 index 7fa59e3..0000000 --- a/core/src/main/scala/codesearch/core/index/details/NpmDetails.scala +++ /dev/null @@ -1,39 +0,0 @@ -package codesearch.core.index.details - -import cats.effect.{ContextShift, IO} -import codesearch.core._ -import codesearch.core.config.JavaScriptConfig -import fs2.Stream -import fs2.io._ -import io.circe.fs2._ -import io.circe.generic.auto._ -import io.circe.{Decoder, HCursor} - -import scala.language.higherKinds - -private final case class NpmRegistryPackage(name: String, version: String) -private final case class NpmPackage(name: String, version: String) - -private[index] final class NpmDetails(config: JavaScriptConfig)(implicit shift: ContextShift[IO]) { - - private implicit val docDecoder: Decoder[NpmRegistryPackage] = (c: HCursor) => { - val doc = c.downField("doc") - for { - name <- doc.get[String]("name") - distTag = doc.downField("dist-tags") - tag <- distTag.get[String]("latest") - } yield NpmRegistryPackage(name, tag) - } - - def detailsMap: Stream[IO, (String, String)] = { - file - .readAll[IO](config.repoJsonPath, BlockingEC, chunkSize = 4096) - .through(byteStreamParser[IO]) - .through(decoder[IO, NpmPackage]) - .map(npmPackage => npmPackage.name -> npmPackage.version) - } -} - -private[index] object NpmDetails { - def apply(config: JavaScriptConfig)(implicit shift: ContextShift[IO]) = new NpmDetails(config) -} diff --git a/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala new file mode 100644 index 0000000..0eac62b --- /dev/null +++ b/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala @@ -0,0 +1,15 @@ +package codesearch.core.index.indexer + +import cats.effect.{ContextShift, Sync} +import codesearch.core.index.directory.СindexDirectory.HaskellCindex +import doobie.util.transactor.Transactor + +final class 
HaskellIndexer[F[_]: Sync: ContextShift]( + xa: Transactor[F] +) extends SourcesIndexer[F](HaskellCindex, "hackage", xa) + +object HaskellIndexer { + def apply[F[_]: Sync: ContextShift]( + xa: Transactor[F] + ): HaskellIndexer[F] = new HaskellIndexer(xa) +} diff --git a/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala b/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala new file mode 100644 index 0000000..b0ad511 --- /dev/null +++ b/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala @@ -0,0 +1,84 @@ +package codesearch.core.index.indexer + +import java.nio.file.StandardCopyOption.REPLACE_EXISTING +import java.nio.file.StandardOpenOption.{CREATE, TRUNCATE_EXISTING} +import java.nio.file.{Files, Path => NioPath} + +import cats.effect._ +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import codesearch.core.BlockingEC +import codesearch.core.db.repository.{Package, PackageRep} +import codesearch.core.index.directory.{Directory, СindexDirectory} +import codesearch.core.syntax.path._ +import doobie.util.transactor.Transactor +import fs2.io.file +import fs2.text.utf8Encode +import fs2.{Pipe, Stream} + +import scala.sys.process.Process + +private[indexer] trait Indexer[F[_]] { + def index: F[Unit] +} + +private[indexer] class SourcesIndexer[F[_]: Sync: ContextShift]( + indexDir: СindexDirectory, + repository: String, + xa: Transactor[F] +) extends Indexer[F] { + + def index: F[Unit] = { + for { + packageDirs <- latestPackagePaths + _ <- createCSearchDir + _ <- dropTempIndexFile + _ <- dirsToIndex(packageDirs) + _ <- indexPackages + _ <- replaceIndexFile + } yield () + } + + private def latestPackagePaths: F[Stream[F, NioPath]] = Sync[F].pure( + PackageRep[F](xa) + .findByRepository(repository) + .through(buildFsPath) + ) + + private def buildFsPath: Pipe[F, Package, NioPath] = { input => + input.map(`package` => Directory.sourcesDir / repository / `package`.name / `package`.version) + } + + private def 
dropTempIndexFile: F[Boolean] = + Sync[F].delay(Files.deleteIfExists(indexDir.tempIndexDirAs[NioPath])) + + private def createCSearchDir: F[Option[NioPath]] = Sync[F].delay( + if (Files.notExists(СindexDirectory.root)) + Some(Files.createDirectories(СindexDirectory.root)) + else None + ) + + private def dirsToIndex(stream: Stream[F, NioPath]): F[Unit] = { + stream + .map(_.toString + "\n") + .through(utf8Encode) + .through(file.writeAll(indexDir.dirsToIndex[NioPath], BlockingEC, List(CREATE, TRUNCATE_EXISTING))) + .compile + .drain + } + + private def indexPackages: F[Unit] = { + val args = Seq("cindex", indexDir.dirsToIndex[String]) + val env = Seq("CSEARCHINDEX" -> indexDir.tempIndexDirAs[String]) + Sync[F].delay(Process(args, None, env: _*) !).void + } + + private def replaceIndexFile: F[NioPath] = + Sync[F].delay( + Files.move( + indexDir.tempIndexDirAs[NioPath], + indexDir.indexDirAs[NioPath], + REPLACE_EXISTING + ) + ) +} diff --git a/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala new file mode 100644 index 0000000..ced6c78 --- /dev/null +++ b/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala @@ -0,0 +1,15 @@ +package codesearch.core.index.indexer + +import cats.effect.{ContextShift, Sync} +import codesearch.core.index.directory.СindexDirectory.JavaScriptCindex +import doobie.util.transactor.Transactor + +final class JavaScriptIndexer[F[_]: Sync: ContextShift]( + xa: Transactor[F] +) extends SourcesIndexer[F](JavaScriptCindex, "npm", xa) + +object JavaScriptIndexer { + def apply[F[_]: Sync: ContextShift]( + xa: Transactor[F] + ): JavaScriptIndexer[F] = new JavaScriptIndexer(xa) +} diff --git a/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala new file mode 100644 index 0000000..af606e2 --- /dev/null +++ 
b/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala @@ -0,0 +1,15 @@ +package codesearch.core.index.indexer + +import cats.effect.{ContextShift, Sync} +import codesearch.core.index.directory.СindexDirectory.RubyCindex +import doobie.util.transactor.Transactor + +final class RubyIndexer[F[_]: Sync: ContextShift]( + xa: Transactor[F] +) extends SourcesIndexer[F](RubyCindex, "gem", xa) + +object RubyIndexer { + def apply[F[_]: Sync: ContextShift]( + xa: Transactor[F] + ): RubyIndexer[F] = new RubyIndexer(xa) +} diff --git a/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala new file mode 100644 index 0000000..457e923 --- /dev/null +++ b/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala @@ -0,0 +1,15 @@ +package codesearch.core.index.indexer + +import cats.effect.{ContextShift, Sync} +import codesearch.core.index.directory.СindexDirectory.RustCindex +import doobie.util.transactor.Transactor + +final class RustIndexer[F[_]: Sync: ContextShift]( + xa: Transactor[F] +) extends SourcesIndexer[F](RustCindex, "crates", xa) + +object RustIndexer { + def apply[F[_]: Sync: ContextShift]( + xa: Transactor[F] + ): RustIndexer[F] = new RustIndexer(xa) +} diff --git a/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala index d4e0030..79e6b11 100644 --- a/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala @@ -19,7 +19,7 @@ class CratesMetaDownloader[F[_]: Monad]( logger: Logger[F] ) extends MetaDownloader[F] { - def downloadMeta: F[Unit] = { + def download: F[Unit] = { for { _ <- logger.info("Downloading rust meta information") archive <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) diff --git a/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala 
b/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala index 97ee37e..ea483d7 100644 --- a/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala @@ -14,7 +14,7 @@ import scala.sys.process._ class GemMetaDownloader[F[_]: Sync](config: RubyConfig, downloader: Downloader[F], logger: Logger[F]) extends MetaDownloader[F] { - def downloadMeta: F[Unit] = + def download: F[Unit] = for { _ <- logger.info("Downloading ruby meta information") _ <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) diff --git a/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala index d24f2e4..f44e4a4 100644 --- a/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala @@ -1,90 +1,34 @@ package codesearch.core.meta -import java.io.InputStream - -import cats.Monad -import cats.effect.{ContextShift, Sync} -import cats.syntax.flatMap._ +import cats.effect.{ConcurrentEffect, ContextShift} import cats.syntax.functor._ -import codesearch.core.BlockingEC -import codesearch.core.config.HaskellConfig +import codesearch.core.config.LanguageConfig import codesearch.core.index.repository.Downloader -import codesearch.core.util.Unarchiver -import com.softwaremill.sttp._ -import fs2.io.file -import fs2.{Pipe, Stream} +import codesearch.core.meta.unarchiver.HaskellIndexUnarchiver +import doobie.Transactor import io.chrisdavenport.log4cats.Logger import io.chrisdavenport.log4cats.slf4j.Slf4jLogger -import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream} -import org.rauschig.jarchivelib.ArchiveFormat.TAR -import org.rauschig.jarchivelib.CompressionType.GZIP -class HackageMetaDownloader[F[_]: Monad: Sync: ContextShift]( - config: HaskellConfig, - unarchiver: Unarchiver[F], +final class 
HackageMetaDownloader[F[_]: ConcurrentEffect: ContextShift]( + config: LanguageConfig, downloader: Downloader[F], + xa: Transactor[F], logger: Logger[F] -) extends MetaDownloader[F] { - - def downloadMeta: F[Unit] = - for { - _ <- logger.info("Downloading haskell meta information") - _ <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) - _ <- unarchiver.extract(config.repoArchivePath, config.repoPath, TAR, GZIP) - _ <- logger.info("Downloading finished") - } yield () - - def storeMeta: F[Unit] = { - file - .readAll[F](config.repoArchivePath, BlockingEC, chunkSize = 4096) - .through(tarEntries) - .through(flatPackages) - } - - private final case class Package(name: String, version: String) - - private def tarEntries: Pipe[F, InputStream, TarArchiveEntry] = { input => - input.flatMap { inputStream => - Stream - .bracket(Sync[F].delay(new TarArchiveInputStream(inputStream)))(tarStream => Sync[F].delay(tarStream.close())) - .flatMap { tarStream => - Stream.unfoldEval[F, TarArchiveInputStream, TarArchiveEntry](tarStream) { tarStream => - Sync[F].delay { - tarStream.getNextTarEntry match { - case entry: TarArchiveEntry => Some(entry, tarStream) - case _ => None - } - } - } - } - } - } - - private def flatPackages: Pipe[F, TarArchiveEntry, Package] = { input => - input.flatMap { entry => - val parentName = entry.getName - val nestedEntries = entry.getDirectoryEntries - Stream.emits(nestedEntries.map(nested => Package(parentName, nested.getName))) - } - } - - private def store: Pipe[F, Package, Unit] = { input => - val batchSize = 10000 - input.chunkN(batchSize).map { packages => - val packagesBatch = packages.toList - - } //etc - } - -} +) extends IndexDownloader[F]( + config, + downloader, + xa, + logger, + HaskellIndexUnarchiver(config) + ) object HackageMetaDownloader { - def apply[F[_]: Sync: ContextShift]( - config: HaskellConfig, - unarchiver: Unarchiver[F], - downloader: Downloader[F] + def apply[F[_]: ConcurrentEffect: ContextShift]( + 
config: LanguageConfig, + downloader: Downloader[F], + xa: Transactor[F] ): F[MetaDownloader[F]] = for { logger <- Slf4jLogger.create - } yield new HackageMetaDownloader(config, unarchiver, downloader, logger) + } yield new HackageMetaDownloader(config, downloader, xa, logger) } diff --git a/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala index 95b72a9..513f96b 100644 --- a/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala @@ -1,10 +1,51 @@ package codesearch.core.meta -trait MetaDownloader[F[_]] { +import cats.effect.{ContextShift, Sync} +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import codesearch.core.config.LanguageConfig +import codesearch.core.db.repository.{PackageIndex, PackageIndexRep} +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.unarchiver.StreamIndexUnarchiver +import com.softwaremill.sttp.Uri +import doobie.util.transactor.Transactor +import fs2.Pipe +import io.chrisdavenport.log4cats.Logger + +private[meta] trait MetaDownloader[F[_]] { /** * Download meta information about packages from remote repository * e.g. 
for Haskell is list of versions and cabal file for each version */ - def downloadMeta: F[Unit] + def download: F[Unit] +} + +private[meta] class IndexDownloader[F[_]: Sync: ContextShift]( + config: LanguageConfig, + downloader: Downloader[F], + xa: Transactor[F], + logger: Logger[F], + unarchiver: StreamIndexUnarchiver[F] +) extends MetaDownloader[F] { + + def download: F[Unit] = + for { + _ <- logger.info(s"Downloading ${config.repository} meta information") + _ <- downloader + .download(Uri(config.repoIndexUrl)) + .through(unarchiver.packages) + .through(store) + .compile + .drain + _ <- logger.info("Downloading finished") + } yield () + + private def store: Pipe[F, PackageIndex, Unit] = { input => + val batchSize = 10000 + val packageIndexRep = PackageIndexRep[F](xa) + input.chunkN(batchSize).map { packages => + packageIndexRep.insertRepIndexes(packages.toList) + } + } } diff --git a/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala index df85a83..d267db2 100644 --- a/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala @@ -1,111 +1,34 @@ package codesearch.core.meta -import java.nio.file.StandardOpenOption.{CREATE, TRUNCATE_EXISTING} - import cats.effect.{ContextShift, Sync} -import cats.syntax.flatMap._ import cats.syntax.functor._ -import codesearch.core.BlockingEC -import codesearch.core.config.JavaScriptConfig +import codesearch.core.config.LanguageConfig import codesearch.core.index.repository.Downloader -import com.softwaremill.sttp._ -import fs2.io.file -import fs2.text._ -import fs2.{Pipe, Stream} -import fs2json._ +import codesearch.core.meta.unarchiver.JavaScriptUnarchiver +import doobie.util.transactor.Transactor import io.chrisdavenport.log4cats.Logger import io.chrisdavenport.log4cats.slf4j.Slf4jLogger -import io.circe._ -import io.circe.fs2._ -import io.circe.generic.auto._ -import 
io.circe.syntax._ -class NpmMetaDownloader[F[_]: Sync: ContextShift]( - config: JavaScriptConfig, +final class NpmMetaDownloader[F[_]: Sync: ContextShift]( + config: LanguageConfig, downloader: Downloader[F], + xa: Transactor[F], logger: Logger[F] -) extends MetaDownloader[F] { - - def downloadMeta: F[Unit] = - for { - _ <- logger.info("Downloading javascript meta information") - _ <- Sync[F].delay(config.repoJsonPath.getParent.toFile.mkdirs()) - _ <- downloader - .download(Uri(config.repoIndexUrl)) - .through(tokenParser[F]) - .through(tokenFilter) - .through(prettyPrinter()) - .through(cutStream) - .through(byteArrayParser[F]) - .through(decoder[NpmRegistryPackage]) - .map(_.asJson.noSpaces + "\n") - .through(utf8Encode) - .through(file.writeAll(config.repoJsonPath, BlockingEC, List(CREATE, TRUNCATE_EXISTING))) - .compile - .drain - _ <- logger.info("Downloading finished") - } yield () - - def tokenFilter: Pipe[F, JsonToken, JsonToken] = - TokenFilter.downObject - .downField("rows") - .downArray - .downObject - .downField("doc") - .downObject - .removeFields( - Set( - "_id", - "_rev", - "versions", - "description", - "maintainers", - "homepage", - "keywords", - "readme", - "author", - "bugs", - "license", - "readmeFilename" - ) - ) - - def cutStream: Pipe[F, Byte, Byte] = { input => - var depth = 0 - input.filter { byte => - if (byte == '[') { - depth += 1; true - } else if (byte == ']') { - depth -= 1; true - } else depth > 0 - } - } - - def decoder[A](implicit decode: Decoder[A]): Pipe[F, Json, A] = - _.flatMap { json => - decode(json.hcursor) match { - case Left(_) => Stream.empty - case Right(a) => Stream.emit(a) - } - } -} - -final case class NpmRegistryPackage(name: String, version: String) - -object NpmRegistryPackage { - implicit val docDecoder: Decoder[NpmRegistryPackage] = { c => - val doc = c.downField("doc") - for { - name <- doc.get[String]("name") - distTag = doc.downField("dist-tags") - tag <- distTag.get[String]("latest") - } yield 
NpmRegistryPackage(name, tag) - } -} +) extends IndexDownloader[F]( + config, + downloader, + xa, + logger, + JavaScriptUnarchiver(config) + ) object NpmMetaDownloader { - def apply[F[_]: Sync: ContextShift](config: JavaScriptConfig, downloader: Downloader[F]): F[MetaDownloader[F]] = + def apply[F[_]: Sync: ContextShift]( + config: LanguageConfig, + downloader: Downloader[F], + xa: Transactor[F] + ): F[MetaDownloader[F]] = for { logger <- Slf4jLogger.create - } yield new NpmMetaDownloader(config, downloader, logger) + } yield new NpmMetaDownloader(config, downloader, xa, logger) } diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala new file mode 100644 index 0000000..fbd3f28 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala @@ -0,0 +1,53 @@ +package codesearch.core.meta.unarchiver + +import java.io.InputStream + +import cats.effect.{ConcurrentEffect, Sync} +import codesearch.core.config.LanguageConfig +import codesearch.core.db.repository.PackageIndex +import fs2.io.toInputStream +import fs2.{Pipe, Stream} +import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream} + +final class HaskellIndexUnarchiver[F[_]: ConcurrentEffect]( + config: LanguageConfig +) extends StreamIndexUnarchiver[F] { + + def packages: Pipe[F, Byte, PackageIndex] = { input => + input + .through(toInputStream) + .through(tarEntries) + .through(flatPackages) + } + + private def tarEntries: Pipe[F, InputStream, TarArchiveEntry] = { input => + input.flatMap { inputStream => + Stream + .bracket(Sync[F].delay(new TarArchiveInputStream(inputStream)))(tis => Sync[F].delay(tis.close())) + .flatMap { tarStream => + Stream.unfoldEval[F, TarArchiveInputStream, TarArchiveEntry](tarStream) { tarStream => + Sync[F].delay { + tarStream.getNextTarEntry match { + case entry: TarArchiveEntry => Some(entry, 
tarStream) + case _ => None + } + } + } + } + } + } + + private def flatPackages: Pipe[F, TarArchiveEntry, PackageIndex] = { input => + input.flatMap { entry => + val parentName = entry.getName + val nestedEntries = entry.getDirectoryEntries + Stream.emits(nestedEntries.map(nested => PackageIndex(parentName, nested.getName, config.repository))) + } + } +} + +object HaskellIndexUnarchiver { + def apply[F[_]: ConcurrentEffect]( + config: LanguageConfig + ): HaskellIndexUnarchiver[F] = new HaskellIndexUnarchiver(config) +} diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala new file mode 100644 index 0000000..031c4fb --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala @@ -0,0 +1,82 @@ +package codesearch.core.meta.unarchiver + +import cats.effect.Sync +import codesearch.core.config.LanguageConfig +import codesearch.core.db.repository.PackageIndex +import fs2.{Pipe, Stream} +import fs2json.{JsonToken, TokenFilter, prettyPrinter, tokenParser} +import io.circe.fs2.byteArrayParser +import io.circe.{Decoder, Json} + +final class JavaScriptUnarchiver[F[_]: Sync]( + config: LanguageConfig +) extends StreamIndexUnarchiver[F] { + + private implicit val docDecoder: Decoder[PackageIndex] = { cursor => + val doc = cursor.downField("doc") + for { + name <- doc.get[String]("name") + distTag = doc.downField("dist-tags") + tag <- distTag.get[String]("latest") + } yield PackageIndex(name, tag, config.repository) + } + + def packages: Pipe[F, Byte, PackageIndex] = { input => + input + .through(tokenParser[F]) + .through(tokenFilter) + .through(prettyPrinter()) + .through(cutStream) + .through(byteArrayParser[F]) + .through(decoder) + } + + private def tokenFilter: Pipe[F, JsonToken, JsonToken] = + TokenFilter.downObject + .downField("rows") + .downArray + .downObject + .downField("doc") + .downObject + .removeFields( + 
Set( + "_id", + "_rev", + "versions", + "description", + "maintainers", + "homepage", + "keywords", + "readme", + "author", + "bugs", + "license", + "readmeFilename" + ) + ) + + private def cutStream: Pipe[F, Byte, Byte] = { input => + var depth = 0 + input.filter { byte => + if (byte == '[') { + depth += 1; true + } else if (byte == ']') { + depth -= 1; true + } else depth > 0 + } + } + + private def decoder(implicit decode: Decoder[PackageIndex]): Pipe[F, Json, PackageIndex] = + _.flatMap { json => + decode(json.hcursor) match { + case Left(_) => Stream.empty + case Right(a) => Stream.emit(a) + } + } +} + +object JavaScriptUnarchiver { + def apply[F[_]: Sync]( + config: LanguageConfig + ): JavaScriptUnarchiver[F] = new JavaScriptUnarchiver(config) +} diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala new file mode 100644 index 0000000..8e23353 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala @@ -0,0 +1,53 @@ +package codesearch.core.meta.unarchiver + +import java.io.InputStream +import java.util.zip.ZipInputStream + +import cats.effect.{ConcurrentEffect, Sync} +import codesearch.core.config.LanguageConfig +import codesearch.core.db.repository.PackageIndex +import fs2.{Pipe, Stream} +import fs2.io.toInputStream +import org.apache.commons.compress.archivers.zip.{ZipArchiveEntry, ZipArchiveInputStream, ZipFile} + +final class RustIndexUnarchiver[F[_]: ConcurrentEffect]( + config: LanguageConfig +) extends StreamIndexUnarchiver[F] { + + def packages: Pipe[F, Byte, PackageIndex] = { input => + input + .through(toInputStream) + .through(zipEntries) + .through(flatPackages) + } + + private def zipEntries: Pipe[F, InputStream, ZipArchiveEntry] = { input => + input.flatMap { inputStream => + Stream + .bracket(Sync[F].delay(new ZipArchiveInputStream(inputStream)))(zis => Sync[F].delay(zis.close())) + 
.flatMap { zipStream => + Stream.unfoldEval[F, ZipArchiveInputStream, ZipArchiveEntry](zipStream) { zipStream => + Sync[F].delay { + zipStream.getNextZipEntry match { + case entry: ZipArchiveEntry => Some(entry, zipStream) + case _ => None + } + } + } + } + } + } + + private def flatPackages: Pipe[F, ZipArchiveEntry, PackageIndex] = { input => + val a = ZipInputStream + val b = ZipFile + input.flatMap(_.) + } + +} + +object RustIndexUnarchiver { + def apply[F[_]: ConcurrentEffect]( + config: LanguageConfig + ): RustIndexUnarchiver[F] = new RustIndexUnarchiver(config) +} diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala new file mode 100644 index 0000000..6003e05 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala @@ -0,0 +1,8 @@ +package codesearch.core.meta.unarchiver + +import codesearch.core.db.repository.PackageIndex +import fs2.Pipe + +trait StreamIndexUnarchiver[F[_]] { + def packages: Pipe[F, Byte, PackageIndex] +} diff --git a/core/src/main/scala/codesearch/core/model/CratesTable.scala b/core/src/main/scala/codesearch/core/model/CratesTable.scala index a5447af..8e9c689 100644 --- a/core/src/main/scala/codesearch/core/model/CratesTable.scala +++ b/core/src/main/scala/codesearch/core/model/CratesTable.scala @@ -2,4 +2,4 @@ package codesearch.core.model import slick.jdbc.PostgresProfile.api._ -class CratesTable(tag: Tag) extends DefaultTable(tag, "CRATES") {} +class CratesTable(tag: Tag) extends DefaultTable(tag, "CRATES") diff --git a/core/src/main/scala/codesearch/core/model/DefaultTable.scala b/core/src/main/scala/codesearch/core/model/DefaultTable.scala index ad3bcde..0759ea1 100644 --- a/core/src/main/scala/codesearch/core/model/DefaultTable.scala +++ b/core/src/main/scala/codesearch/core/model/DefaultTable.scala @@ -7,13 +7,9 @@ import slick.jdbc.PostgresProfile.api._ // TODO: UTC 
class DefaultTable(tag: Tag, tableName: String) extends Table[(String, String, Timestamp)](tag, tableName) { - def packageName = column[String](s"${tableName}_PACKAGE_NAME", O.PrimaryKey) - def lastVersion = column[String](s"${tableName}_VERSION") - - def updated = column[Timestamp](s"${tableName}_UPDATED") - def * = (packageName, lastVersion, updated) - + def packageName = column[String](s"${tableName}_PACKAGE_NAME", O.PrimaryKey) + def lastVersion = column[String](s"${tableName}_VERSION") + def updated = column[Timestamp](s"${tableName}_UPDATED") def indexTimestamps = index(s"${tableName}_LAST_UPDATED", updated) - } diff --git a/core/src/main/scala/codesearch/core/model/GemTable.scala b/core/src/main/scala/codesearch/core/model/GemTable.scala index 0832e53..63cac4a 100644 --- a/core/src/main/scala/codesearch/core/model/GemTable.scala +++ b/core/src/main/scala/codesearch/core/model/GemTable.scala @@ -2,4 +2,4 @@ package codesearch.core.model import slick.jdbc.PostgresProfile.api._ -class GemTable(tag: Tag) extends DefaultTable(tag, "GEM") {} +class GemTable(tag: Tag) extends DefaultTable(tag, "GEM") diff --git a/core/src/main/scala/codesearch/core/model/HackageTable.scala b/core/src/main/scala/codesearch/core/model/HackageTable.scala index 1c70269..24b87df 100644 --- a/core/src/main/scala/codesearch/core/model/HackageTable.scala +++ b/core/src/main/scala/codesearch/core/model/HackageTable.scala @@ -2,4 +2,4 @@ package codesearch.core.model import slick.jdbc.PostgresProfile.api._ -class HackageTable(tag: Tag) extends DefaultTable(tag, "HACKAGE") {} +class HackageTable(tag: Tag) extends DefaultTable(tag, "HACKAGE") diff --git a/core/src/main/scala/codesearch/core/model/NpmTable.scala b/core/src/main/scala/codesearch/core/model/NpmTable.scala index f9a92fb..196494e 100644 --- a/core/src/main/scala/codesearch/core/model/NpmTable.scala +++ b/core/src/main/scala/codesearch/core/model/NpmTable.scala @@ -2,4 +2,4 @@ package codesearch.core.model import 
slick.jdbc.PostgresProfile.api._ -class NpmTable(tag: Tag) extends DefaultTable(tag, "NPM") {} +class NpmTable(tag: Tag) extends DefaultTable(tag, "NPM") diff --git a/core/src/main/scala/codesearch/core/model/Version.scala b/core/src/main/scala/codesearch/core/model/Version.scala index 719996a..8ac083b 100644 --- a/core/src/main/scala/codesearch/core/model/Version.scala +++ b/core/src/main/scala/codesearch/core/model/Version.scala @@ -1,10 +1,9 @@ package codesearch.core.model -case class Version(verString: String) extends Ordered[Version] { - import scala.math.Ordered.orderingToOrdered +import scala.math.Ordered.orderingToOrdered +case class Version(verString: String) extends Ordered[Version] { val version: Iterable[Long] = ("""\d+""".r findAllIn verString).toSeq.map(_.toLong) - override def compare(that: Version): Int = this.version compare that.version } diff --git a/core/src/main/scala/codesearch/core/util/manatki/Raise.scala b/core/src/main/scala/codesearch/core/util/manatki/Raise.scala new file mode 100644 index 0000000..22de29d --- /dev/null +++ b/core/src/main/scala/codesearch/core/util/manatki/Raise.scala @@ -0,0 +1,16 @@ +package codesearch.core.util.manatki + +import cats.ApplicativeError + +trait Raise[F[_], E] { + def raise[A](err: E): F[A] +} + +object Raise { + implicit def raiseApplicativeError[F[_], E, E1]( + implicit appErr: ApplicativeError[F, E], + sub: E1 <:< E + ): Raise[F, E1] = new Raise[F, E1] { + override def raise[A](err: E1): F[A] = appErr.raiseError(err) + } +} diff --git a/core/src/main/scala/codesearch/core/util/manatki/syntax/raise.scala b/core/src/main/scala/codesearch/core/util/manatki/syntax/raise.scala new file mode 100644 index 0000000..695fa03 --- /dev/null +++ b/core/src/main/scala/codesearch/core/util/manatki/syntax/raise.scala @@ -0,0 +1,30 @@ +package codesearch.core.util.manatki.syntax + +import cats.Applicative +import codesearch.core.util.manatki.Raise + +object raise { + final implicit class RaiseOps[E](val err: E) 
extends AnyVal { + def raise[F[_], A](implicit raise: Raise[F, E]): F[A] = raise.raise(err) + } + + final implicit class RaiseOptionOps[A](val opt: Option[A]) extends AnyVal { + def liftTo[F[_]] = new RaiseLiftToApplied[F, A](opt) + } + + final implicit class RaiseEitherOps[E, A](val either: Either[E, A]) extends AnyVal { + def toRaise[F[_]](implicit app: Applicative[F], raise: Raise[F, E]): F[A] = + either match { + case Left(err) => raise.raise(err) + case Right(value) => app.pure(value) + } + } + + class RaiseLiftToApplied[F[_], A](val opt: Option[A]) extends AnyVal { + def apply[E](err: => E)(implicit raise: Raise[F, E], app: Applicative[F]): F[A] = + opt match { + case None => raise.raise(err) + case Some(a) => app.pure(a) + } + } +} \ No newline at end of file diff --git a/project/Builder.scala b/project/Builder.scala index f211e3e..6a7491c 100644 --- a/project/Builder.scala +++ b/project/Builder.scala @@ -63,6 +63,7 @@ object Builder { libraryDependencies ++= Seq( "com.typesafe.slick" %% "slick" % "3.2.3", "com.typesafe.slick" %% "slick-hikaricp" % "3.2.3", + "com.github.tminglei" %% "slick-pg" % "0.17.2", "org.postgresql" % "postgresql" % "42.2.2", "com.softwaremill.sttp" %% "async-http-client-backend-fs2" % "1.3.8", "co.fs2" %% "fs2-core" % "1.0.4", @@ -77,7 +78,12 @@ object Builder { "io.chrisdavenport" %% "log4cats-slf4j" % "0.2.0-RC2", "org.apache.commons" % "commons-compress" % "1.18", "org.scalactic" %% "scalactic" % "3.0.5", - "org.scalatest" %% "scalatest" % "3.0.5" % "test" + "org.scalatest" %% "scalatest" % "3.0.5" % "test", + "org.tpolecat" %% "doobie-core" % "0.6.0", + "org.tpolecat" %% "doobie-hikari" % "0.6.0", + "org.tpolecat" %% "doobie-postgres" % "0.6.0", + "org.tpolecat" %% "doobie-specs2" % "0.6.0", + "org.flywaydb" % "flyway-core" % "5.2.4", ), assemblyMergeStrategy in assembly := { case PathList("META-INF", _ @_*) => MergeStrategy.discard From 797fc8b9782992d587469a9f4d5fc31a5cfc702f Mon Sep 17 00:00:00 2001 From: kamilongus Date: Tue, 
23 Apr 2019 19:06:18 +0400 Subject: [PATCH 3/7] + some changes Issue: https://github.com/aelve/codesearch/issues/250 --- core/src/main/resources/application.conf | 8 ++ .../scala/codesearch/core/config/Config.scala | 56 ++++++++++---- .../core/db/repository/PackageIndexRep.scala | 19 ++++- .../codesearch/core/index/RustIndex.scala | 3 +- .../core/index/directory/Extractor.scala | 5 +- .../core/index/repository/Downloader.scala | 5 +- .../core/meta/CratesMetaDownloader.scala | 41 ---------- .../core/meta/GemMetaDownloader.scala | 33 --------- .../core/meta/HackageMetaDownloader.scala | 34 --------- .../codesearch/core/meta/MetaDownloader.scala | 51 ------------- .../core/meta/NpmMetaDownloader.scala | 34 --------- .../downloader/ArchivedIndexDownloader.scala | 31 ++++++++ .../ByteStreamIndexDownloader.scala | 30 ++++++++ .../downloader/CratesIndexDownloader.scala | 30 ++++++++ .../meta/downloader/GemIndexDownloader.scala | 28 +++++++ .../downloader/HackageIndexDownloader.scala | 30 ++++++++ .../meta/downloader/NpmMetaDownloader.scala | 29 ++++++++ .../RepositoryIndexDownloader.scala | 10 +++ .../meta/parser/IndexByteStreamParser.scala | 8 ++ .../JavaScriptIndexParser.scala} | 37 +++++----- .../unarchiver/HaskellIndexUnarchiver.scala | 72 +++++++++--------- .../meta/unarchiver/RubyIndexUnarchiver.scala | 45 +++++++++++ .../meta/unarchiver/RustIndexUnarchiver.scala | 74 +++++++++---------- .../unarchiver/StreamIndexUnarchiver.scala | 6 +- .../scala/codesearch/core/util/FsUtils.scala | 23 ++++++ .../codesearch/core/util/Unarchiver.scala | 6 +- 26 files changed, 435 insertions(+), 313 deletions(-) delete mode 100644 core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala delete mode 100644 core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala delete mode 100644 core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala delete mode 100644 core/src/main/scala/codesearch/core/meta/MetaDownloader.scala delete mode 100644 
core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/downloader/RepositoryIndexDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala rename core/src/main/scala/codesearch/core/meta/{unarchiver/JavaScriptUnarchiver.scala => parser/JavaScriptIndexParser.scala} (68%) create mode 100644 core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala create mode 100644 core/src/main/scala/codesearch/core/util/FsUtils.scala diff --git a/core/src/main/resources/application.conf b/core/src/main/resources/application.conf index 0a087ad..b984e0b 100644 --- a/core/src/main/resources/application.conf +++ b/core/src/main/resources/application.conf @@ -21,16 +21,24 @@ languagesConfig { haskell { repository = "hackage" repoIndexUrl = "http://hackage.haskell.org/packages/index.tar.gz" + repoArchivePath = "./data/meta/haskell/index.tar.gz" + repoPath = "./data/meta/haskell/" concurrentTasksCount = 30 } rust { repository = "crates" repoIndexUrl = "https://github.com/rust-lang/crates.io-index/archive/master.zip" + repoArchivePath = "./data/meta/rust/archive.zip" + repoPath = "./data/meta/rust/" concurrentTasksCount = 30 + ignoreFiles = ["test-max-version-example-crate", "version-length-checking-is-overrated", "config.json", "archive.zip", ".git"] } ruby { 
repository = "gem" repoIndexUrl = "http://rubygems.org/latest_specs.4.8.gz" + repoArchivePath = "./data/meta/ruby/ruby_index.gz" + repoJsonPath = "./data/meta/ruby/ruby_index.json" + scriptPath = "./scripts/update_index.rb" concurrentTasksCount = 30 } javascript { diff --git a/core/src/main/scala/codesearch/core/config/Config.scala b/core/src/main/scala/codesearch/core/config/Config.scala index 3b24704..6b2e5e2 100644 --- a/core/src/main/scala/codesearch/core/config/Config.scala +++ b/core/src/main/scala/codesearch/core/config/Config.scala @@ -1,11 +1,21 @@ package codesearch.core.config import java.net.URI +import java.nio.file.Path import cats.effect.Sync import pureconfig.module.catseffect._ import pureconfig.{CamelCase, ConfigFieldMapping, ProductHint} +trait RemoteIndexConfig { + def repository: String + def repoIndexUrl: URI +} + +trait IndexArchiveConfig extends RemoteIndexConfig { + def repoArchivePath: Path +} + case class Config( db: DatabaseConfig, snippetConfig: SnippetConfig, @@ -18,13 +28,7 @@ case class DatabaseConfig( port: Int, name: String, user: String, - password: String, - properties: DatabaseProperties -) - -case class DatabaseProperties( - driver: String, - url: String + password: String ) case class SnippetConfig( @@ -34,17 +38,43 @@ case class SnippetConfig( ) case class LanguagesConfig( - haskell: LanguageConfig, - ruby: LanguageConfig, - rust: LanguageConfig, - javascript: LanguageConfig + haskell: HaskellConfig, + ruby: RubyConfig, + rust: RustConfig, + javascript: JavaScriptConfig ) -case class LanguageConfig( +case class HaskellConfig( repository: String, repoIndexUrl: URI, + repoArchivePath: Path, + repoPath: Path, concurrentTasksCount: Int -) +) extends IndexArchiveConfig + +case class RubyConfig( + repository: String, + repoIndexUrl: URI, + repoArchivePath: Path, + repoJsonPath: Path, + scriptPath: Path, + concurrentTasksCount: Int +) extends IndexArchiveConfig + +case class RustConfig( + repository: String, + repoIndexUrl: URI, + 
repoArchivePath: Path, + repoPath: Path, + concurrentTasksCount: Int, + ignoreFiles: Set[String] +) extends IndexArchiveConfig + +case class JavaScriptConfig( + repository: String, + repoIndexUrl: URI, + concurrentTasksCount: Int +) extends RemoteIndexConfig case class MetricsConfig( enableMatomoMetrics: Boolean diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala b/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala index 0942715..f9d09f8 100644 --- a/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala +++ b/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala @@ -4,6 +4,7 @@ import cats.Monad import cats.implicits._ import doobie._ import doobie.implicits._ +import fs2.Stream final case class PackageIndex( name: String, @@ -12,7 +13,8 @@ final case class PackageIndex( ) trait PackageIndexRep[F[_]] { - def insertRepIndexes(packages: List[PackageIndex]): F[Int] + def insertIndexes(packages: List[PackageIndex]): F[Int] + def insertIndexes(stream: Stream[F, PackageIndex]): F[Int] } object PackageIndexRep { @@ -25,9 +27,20 @@ object PackageIndexRep { | SET version = excluded.version """.stripMargin - def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexRep[F] = - (packages: List[PackageIndex]) => + def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexRep[F] = new PackageIndexRep[F] { + + def insertIndexes(packages: List[PackageIndex]): F[Int] = Update[PackageIndex](batchInsertQuery) .updateMany(packages) .transact(xa) + + def insertIndexes(stream: Stream[F, PackageIndex]): F[Int] = { + val batchSize = 10000 + stream + .chunkN(batchSize) + .map(packages => insertIndexes(packages.toList)) + .compile + .drain + } + } } diff --git a/core/src/main/scala/codesearch/core/index/RustIndex.scala b/core/src/main/scala/codesearch/core/index/RustIndex.scala index e6ec163..6e46df8 100644 --- a/core/src/main/scala/codesearch/core/index/RustIndex.scala +++ 
b/core/src/main/scala/codesearch/core/index/RustIndex.scala @@ -26,7 +26,8 @@ class RustIndex(rustConfig: RustConfig)( "test-max-version-example-crate", "version-length-checking-is-overrated", "config.json", - "archive.zip" + "archive.zip", + ".git" ) override protected val cindexDir: СindexDirectory = RustCindex diff --git a/core/src/main/scala/codesearch/core/index/directory/Extractor.scala b/core/src/main/scala/codesearch/core/index/directory/Extractor.scala index 0cb2eea..00b7ffe 100644 --- a/core/src/main/scala/codesearch/core/index/directory/Extractor.scala +++ b/core/src/main/scala/codesearch/core/index/directory/Extractor.scala @@ -1,6 +1,5 @@ package codesearch.core.index.directory -import java.io.File import java.nio.file.Path import cats.effect.Sync @@ -18,7 +17,7 @@ private[index] trait Extractor { * @param from is file to unarchiving * @param to is target directory */ - def unzipUsingMethod[F[_]](from: Path, to: Path)(implicit F: Sync[F]): F[Unit] = F.delay( + def unzipUsingMethod[F[_]: Sync](from: Path, to: Path): F[Unit] = Sync[F].delay( ArchiverFactory .createArchiver(TAR, GZIP) .extract(from.toFile, to.toFile) @@ -41,7 +40,7 @@ private[index] trait Extractor { * @param unarchived is directory contains unarchived files * @return same directory containing all files and directories from unarchived files */ - def flatDir[F[_]](unarchived: Path)(implicit F: Sync[F]): F[Path] = F.delay { + def flatDir[F[_]: Sync](unarchived: Path): F[Path] = Sync[F].delay { val dir = unarchived.toFile dir.listFiles .filter(_.isDirectory) diff --git a/core/src/main/scala/codesearch/core/index/repository/Downloader.scala b/core/src/main/scala/codesearch/core/index/repository/Downloader.scala index fc05a16..26f25f7 100644 --- a/core/src/main/scala/codesearch/core/index/repository/Downloader.scala +++ b/core/src/main/scala/codesearch/core/index/repository/Downloader.scala @@ -28,7 +28,10 @@ object Downloader { def apply[F[_]: Downloader]: Downloader[F] = implicitly - def 
create[F[_]: ContextShift](implicit http: SttpBackend[F, Stream[F, ByteBuffer]], F: Sync[F]): Downloader[F] = + def create[F[_]: ContextShift]( + implicit http: SttpBackend[F, Stream[F, ByteBuffer]], + F: Sync[F] + ): Downloader[F] = new Downloader[F] { /** diff --git a/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala deleted file mode 100644 index 79e6b11..0000000 --- a/core/src/main/scala/codesearch/core/meta/CratesMetaDownloader.scala +++ /dev/null @@ -1,41 +0,0 @@ -package codesearch.core.meta - -import cats.Monad -import cats.effect.Sync -import cats.syntax.flatMap._ -import cats.syntax.functor._ -import codesearch.core.config.RustConfig -import codesearch.core.index.repository.Downloader -import codesearch.core.util.Unarchiver -import com.softwaremill.sttp.Uri -import io.chrisdavenport.log4cats.Logger -import io.chrisdavenport.log4cats.slf4j.Slf4jLogger -import org.rauschig.jarchivelib.ArchiveFormat.ZIP - -class CratesMetaDownloader[F[_]: Monad]( - config: RustConfig, - unarchiver: Unarchiver[F], - downloader: Downloader[F], - logger: Logger[F] -) extends MetaDownloader[F] { - - def download: F[Unit] = { - for { - _ <- logger.info("Downloading rust meta information") - archive <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) - _ <- unarchiver.extract(archive, config.repoPath, ZIP) - _ <- logger.info("Downloading finished") - } yield () - } -} - -object CratesMetaDownloader { - def apply[F[_]: Sync]( - config: RustConfig, - unarchiver: Unarchiver[F], - downloader: Downloader[F] - ): F[MetaDownloader[F]] = - for { - logger <- Slf4jLogger.create - } yield new CratesMetaDownloader(config, unarchiver, downloader, logger) -} diff --git a/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala deleted file mode 100644 index ea483d7..0000000 --- 
a/core/src/main/scala/codesearch/core/meta/GemMetaDownloader.scala +++ /dev/null @@ -1,33 +0,0 @@ -package codesearch.core.meta - -import cats.effect.Sync -import cats.syntax.flatMap._ -import cats.syntax.functor._ -import codesearch.core.config.RubyConfig -import codesearch.core.index.repository.Downloader -import com.softwaremill.sttp._ -import io.chrisdavenport.log4cats.Logger -import io.chrisdavenport.log4cats.slf4j.Slf4jLogger - -import scala.sys.process._ - -class GemMetaDownloader[F[_]: Sync](config: RubyConfig, downloader: Downloader[F], logger: Logger[F]) - extends MetaDownloader[F] { - - def download: F[Unit] = - for { - _ <- logger.info("Downloading ruby meta information") - _ <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) - _ <- Sync[F].delay { - Seq("ruby", config.scriptPath.toString, config.repoArchivePath.toString, config.repoJsonPath.toString) !! - } - _ <- logger.info("Downloading finished") - } yield () -} - -object GemMetaDownloader { - def apply[F[_]: Sync](config: RubyConfig, downloader: Downloader[F]): F[MetaDownloader[F]] = - for { - logger <- Slf4jLogger.create - } yield new GemMetaDownloader(config, downloader, logger) -} diff --git a/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala deleted file mode 100644 index f44e4a4..0000000 --- a/core/src/main/scala/codesearch/core/meta/HackageMetaDownloader.scala +++ /dev/null @@ -1,34 +0,0 @@ -package codesearch.core.meta - -import cats.effect.{ConcurrentEffect, ContextShift} -import cats.syntax.functor._ -import codesearch.core.config.LanguageConfig -import codesearch.core.index.repository.Downloader -import codesearch.core.meta.unarchiver.HaskellIndexUnarchiver -import doobie.Transactor -import io.chrisdavenport.log4cats.Logger -import io.chrisdavenport.log4cats.slf4j.Slf4jLogger - -final class HackageMetaDownloader[F[_]: ConcurrentEffect: ContextShift]( - config: LanguageConfig, - 
downloader: Downloader[F], - xa: Transactor[F], - logger: Logger[F] -) extends IndexDownloader[F]( - config, - downloader, - xa, - logger, - HaskellIndexUnarchiver(config) - ) - -object HackageMetaDownloader { - def apply[F[_]: ConcurrentEffect: ContextShift]( - config: LanguageConfig, - downloader: Downloader[F], - xa: Transactor[F] - ): F[MetaDownloader[F]] = - for { - logger <- Slf4jLogger.create - } yield new HackageMetaDownloader(config, downloader, xa, logger) -} diff --git a/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala deleted file mode 100644 index 513f96b..0000000 --- a/core/src/main/scala/codesearch/core/meta/MetaDownloader.scala +++ /dev/null @@ -1,51 +0,0 @@ -package codesearch.core.meta - -import cats.effect.{ContextShift, Sync} -import cats.syntax.flatMap._ -import cats.syntax.functor._ -import codesearch.core.config.LanguageConfig -import codesearch.core.db.repository.{PackageIndex, PackageIndexRep} -import codesearch.core.index.repository.Downloader -import codesearch.core.meta.unarchiver.StreamIndexUnarchiver -import com.softwaremill.sttp.Uri -import doobie.util.transactor.Transactor -import fs2.Pipe -import io.chrisdavenport.log4cats.Logger - -private[meta] trait MetaDownloader[F[_]] { - - /** - * Download meta information about packages from remote repository - * e.g. 
for Haskell is list of versions and cabal file for each version - */ - def download: F[Unit] -} - -private[meta] class IndexDownloader[F[_]: Sync: ContextShift]( - config: LanguageConfig, - downloader: Downloader[F], - xa: Transactor[F], - logger: Logger[F], - unarchiver: StreamIndexUnarchiver[F] -) extends MetaDownloader[F] { - - def download: F[Unit] = - for { - _ <- logger.info(s"Downloading ${config.repository} meta information") - _ <- downloader - .download(Uri(config.repoIndexUrl)) - .through(unarchiver.packages) - .through(store) - .compile - .drain - _ <- logger.info("Downloading finished") - } yield () - - private def store: Pipe[F, PackageIndex, Unit] = { input => - val batchSize = 10000 - val packageIndexRep = PackageIndexRep[F](xa) - input.chunkN(batchSize).map { packages => - packageIndexRep.insertRepIndexes(packages.toList) - } - } -} diff --git a/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala deleted file mode 100644 index d267db2..0000000 --- a/core/src/main/scala/codesearch/core/meta/NpmMetaDownloader.scala +++ /dev/null @@ -1,34 +0,0 @@ -package codesearch.core.meta - -import cats.effect.{ContextShift, Sync} -import cats.syntax.functor._ -import codesearch.core.config.LanguageConfig -import codesearch.core.index.repository.Downloader -import codesearch.core.meta.unarchiver.JavaScriptUnarchiver -import doobie.util.transactor.Transactor -import io.chrisdavenport.log4cats.Logger -import io.chrisdavenport.log4cats.slf4j.Slf4jLogger - -final class NpmMetaDownloader[F[_]: Sync: ContextShift]( - config: LanguageConfig, - downloader: Downloader[F], - xa: Transactor[F], - logger: Logger[F] -) extends IndexDownloader[F]( - config, - downloader, - xa, - logger, - JavaScriptUnarchiver(config) - ) - -object NpmMetaDownloader { - def apply[F[_]: Sync: ContextShift]( - config: LanguageConfig, - downloader: Downloader[F], - xa: Transactor[F] - ): F[MetaDownloader[F]] = - for { - 
logger <- Slf4jLogger.create - } yield new NpmMetaDownloader(config, downloader, xa, logger) -} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala new file mode 100644 index 0000000..e7a3485 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala @@ -0,0 +1,31 @@ +package codesearch.core.meta.downloader + +import cats.effect.{ContextShift, Sync} +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import codesearch.core.config.IndexArchiveConfig +import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.unarchiver.StreamIndexUnarchiver +import com.softwaremill.sttp.Uri +import io.chrisdavenport.log4cats.Logger +import org.apache.commons.io.FileUtils + +private[meta] class ArchivedIndexDownloader[F[_]: Sync: ContextShift]( + config: IndexArchiveConfig, + downloader: Downloader[F], + unarchiver: StreamIndexUnarchiver[F], + indexRep: PackageIndexRep[F], + logger: Logger[F] +) extends RepositoryIndexDownloader[F] { + + def download: F[Unit] = + for { + _ <- logger.info(s"Downloading ${config.repository} meta information") + path <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) + stream <- unarchiver.unarchive(path) + _ <- indexRep.insertIndexes(stream) + _ <- Sync[F].delay(FileUtils.cleanDirectory(config.repoArchivePath.getParent.toFile)) + _ <- logger.info("Downloading finished") + } yield () +} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala new file mode 100644 index 0000000..c72e8b9 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.meta.downloader + +import 
cats.effect.{ContextShift, Sync} +import cats.syntax.applicative._ +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import codesearch.core.config.RemoteIndexConfig +import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.parser.IndexByteStreamParser +import com.softwaremill.sttp.Uri +import io.chrisdavenport.log4cats.Logger + +class ByteStreamIndexDownloader[F[_]: Sync: ContextShift]( + config: RemoteIndexConfig, + downloader: Downloader[F], + indexRep: PackageIndexRep[F], + indexParser: IndexByteStreamParser[F], + logger: Logger[F] +) extends RepositoryIndexDownloader[F] { + + def download: F[Unit] = + for { + _ <- logger.info(s"Downloading ${config.repository} meta information") + stream <- downloader.download(Uri(config.repoIndexUrl)).pure[F] + index <- indexParser.parse(stream) + _ <- indexRep.insertIndexes(index) + _ <- logger.info("Downloading finished") + } yield () +} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala new file mode 100644 index 0000000..bc72203 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.meta.downloader + +import cats.effect.{ContextShift, Sync} +import cats.syntax.functor._ +import codesearch.core.config.RustConfig +import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.unarchiver.RustIndexUnarchiver +import codesearch.core.util.Unarchiver +import doobie.util.transactor.Transactor +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object CratesIndexDownloader { + def apply[F[_]: Sync: ContextShift]( + config: RustConfig, + unarchiver: Unarchiver[F], + downloader: Downloader[F], + xa: Transactor[F] + ): F[RepositoryIndexDownloader[F]] = + for { + 
logger <- Slf4jLogger.create + } yield + new ArchivedIndexDownloader( + config, + downloader, + RustIndexUnarchiver(unarchiver, config), + PackageIndexRep(xa), + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala new file mode 100644 index 0000000..b469151 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala @@ -0,0 +1,28 @@ +package codesearch.core.meta.downloader + +import cats.effect.{ContextShift, Sync} +import cats.syntax.functor._ +import codesearch.core.config.RubyConfig +import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.unarchiver.RubyIndexUnarchiver +import doobie.util.transactor.Transactor +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object GemIndexDownloader { + def apply[F[_]: Sync: ContextShift]( + config: RubyConfig, + downloader: Downloader[F], + xa: Transactor[F] + ): F[RepositoryIndexDownloader[F]] = + for { + logger <- Slf4jLogger.create + } yield + new ArchivedIndexDownloader( + config, + downloader, + RubyIndexUnarchiver(config), + PackageIndexRep(xa), + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala new file mode 100644 index 0000000..38806c2 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.meta.downloader + +import cats.effect.{ConcurrentEffect, ContextShift} +import codesearch.core.config.HaskellConfig +import cats.syntax.functor._ +import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.unarchiver.HaskellIndexUnarchiver +import codesearch.core.util.Unarchiver 
+import doobie.util.transactor.Transactor +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object HackageIndexDownloader { + def apply[F[_]: ConcurrentEffect: ContextShift]( + config: HaskellConfig, + downloader: Downloader[F], + unarchiver: Unarchiver[F], + xa: Transactor[F] + ): F[RepositoryIndexDownloader[F]] = + for { + logger <- Slf4jLogger.create + } yield + new ArchivedIndexDownloader( + config, + downloader, + HaskellIndexUnarchiver(unarchiver, config), + PackageIndexRep(xa), + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala new file mode 100644 index 0000000..a7119b5 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala @@ -0,0 +1,29 @@ +package codesearch.core.meta.downloader + +import cats.effect.{ContextShift, Sync} +import cats.syntax.functor._ +import codesearch.core.config.JavaScriptConfig +import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.index.repository.Downloader +import codesearch.core.meta.parser.JavaScriptIndexParser +import doobie.util.transactor.Transactor +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object NpmMetaDownloader { + def apply[F[_]: Sync: ContextShift]( + config: JavaScriptConfig, + downloader: Downloader[F], + xa: Transactor[F] + ): F[RepositoryIndexDownloader[F]] = { + for { + logger <- Slf4jLogger.create + } yield + new ByteStreamIndexDownloader( + config, + downloader, + PackageIndexRep(xa), + JavaScriptIndexParser(config), + logger + ) + } +} diff --git a/core/src/main/scala/codesearch/core/meta/downloader/RepositoryIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/RepositoryIndexDownloader.scala new file mode 100644 index 0000000..9139832 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/downloader/RepositoryIndexDownloader.scala @@ -0,0 +1,10 @@ +package 
codesearch.core.meta.downloader + +private[meta] trait RepositoryIndexDownloader[F[_]] { + + /** + * Download meta information about packages from remote repository + * e.g. for Haskell is list of versions and cabal file for each version + */ + def download: F[Unit] +} diff --git a/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala b/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala new file mode 100644 index 0000000..077bc93 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala @@ -0,0 +1,8 @@ +package codesearch.core.meta.parser + +import codesearch.core.db.repository.PackageIndex +import fs2.Stream + +trait IndexByteStreamParser[F[_]] { + def parse(stream: Stream[F, Byte]): F[Stream[F, PackageIndex]] +} diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala similarity index 68% rename from core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala rename to core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala index 031c4fb..18ac91d 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/JavaScriptUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala @@ -1,16 +1,14 @@ -package codesearch.core.meta.unarchiver +package codesearch.core.meta.parser import cats.effect.Sync -import codesearch.core.config.LanguageConfig +import codesearch.core.config.JavaScriptConfig import codesearch.core.db.repository.PackageIndex import fs2.{Pipe, Stream} import fs2json.{JsonToken, TokenFilter, prettyPrinter, tokenParser} import io.circe.fs2.byteArrayParser import io.circe.{Decoder, Json} -final class JavaScriptUnarchiver[F[_]: Sync]( - config: LanguageConfig -) extends StreamIndexUnarchiver[F] { +final class JavaScriptIndexParser[F[_]: Sync](config: JavaScriptConfig) extends 
IndexByteStreamParser[F] { private implicit val docDecoder: Decoder[PackageIndex] = { cursor => val doc = cursor.downField("doc") @@ -21,14 +19,15 @@ final class JavaScriptUnarchiver[F[_]: Sync]( } yield PackageIndex(name, tag, config.repository) } - def packages: Pipe[F, Byte, PackageIndex] = { input => - input - .through(tokenParser[F]) - .through(tokenFilter) - .through(prettyPrinter()) - .through(cutStream) - .through(byteArrayParser[F]) - .through(decoder) + def parse(stream: Stream[F, Byte]): F[Stream[F, PackageIndex]] = { + Sync[F].pure( + stream + .through(tokenParser[F]) + .through(tokenFilter) + .through(prettyPrinter()) + .through(cutStream) + .through(byteArrayParser[F]) + .through(decoder)) } private def tokenFilter: Pipe[F, JsonToken, JsonToken] = @@ -66,17 +65,17 @@ final class JavaScriptUnarchiver[F[_]: Sync]( } } - private def decoder(implicit decode: Decoder[PackageIndex]): Pipe[F, Json, PackageIndex] = - _.flatMap { json => + private def decoder(implicit decode: Decoder[PackageIndex]): Pipe[F, Json, PackageIndex] = { input => + input.flatMap { json => decode(json.hcursor) match { case Left(_) => Stream.empty case Right(a) => Stream.emit(a) } } + } } -object JavaScriptUnarchiver { - def apply[F[_]: Sync]( - config: LanguageConfig - ): JavaScriptUnarchiver[F] = new JavaScriptUnarchiver(config) +object JavaScriptIndexParser { + def apply[F[_]: Sync](config: JavaScriptConfig): JavaScriptIndexParser[F] = + new JavaScriptIndexParser(config) } diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala index fbd3f28..01bc28b 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala @@ -1,53 +1,53 @@ package codesearch.core.meta.unarchiver -import java.io.InputStream +import java.nio.file.Path -import 
cats.effect.{ConcurrentEffect, Sync} -import codesearch.core.config.LanguageConfig +import cats.Order +import cats.effect.{ConcurrentEffect, ContextShift, Sync} +import cats.instances.list._ +import cats.syntax.foldable._ +import cats.syntax.functor._ +import codesearch.core.config.HaskellConfig import codesearch.core.db.repository.PackageIndex -import fs2.io.toInputStream -import fs2.{Pipe, Stream} -import org.apache.commons.compress.archivers.tar.{TarArchiveEntry, TarArchiveInputStream} +import codesearch.core.model.Version +import codesearch.core.util.Unarchiver +import fs2.{Chunk, Stream} +import org.rauschig.jarchivelib.ArchiveFormat.TAR +import org.rauschig.jarchivelib.CompressionType.GZIP -final class HaskellIndexUnarchiver[F[_]: ConcurrentEffect]( - config: LanguageConfig +final class HaskellIndexUnarchiver[F[_]: Sync]( + unarchiver: Unarchiver[F], + config: HaskellConfig ) extends StreamIndexUnarchiver[F] { - def packages: Pipe[F, Byte, PackageIndex] = { input => - input - .through(toInputStream) - .through(tarEntries) - .through(flatPackages) + def unarchive(path: Path): F[Stream[F, PackageIndex]] = { + for { + _ <- unarchiver.extract(path, config.repoPath, TAR, GZIP) + } yield flatPackages } - private def tarEntries: Pipe[F, InputStream, TarArchiveEntry] = { input => - input.flatMap { inputStream => + private def flatPackages: F[Stream[F, PackageIndex]] = { + Sync[F].pure( Stream - .bracket(Sync[F].delay(new TarArchiveInputStream(inputStream)))(tis => Sync[F].delay(tis.close())) - .flatMap { tarStream => - Stream.unfoldEval[F, TarArchiveInputStream, TarArchiveEntry](tarStream) { tarStream => - Sync[F].delay { - tarStream.getNextTarEntry match { - case entry: TarArchiveEntry => Some(entry, tarStream) - case _ => None - } - } + .evalUnChunk(Sync[F].delay(Chunk.array(config.repoPath.toFile.listFiles))) + .filter(_.isDirectory) + .evalMap { packageDir => + Sync[F].delay { + packageDir.listFiles.toList + .filter(_.isDirectory) + .map(_.getName) + 
.maximumOption(Order.fromLessThan(Version.less)) + .map(version => PackageIndex(packageDir.getName, version, config.repository)) } } - } - } - - private def flatPackages: Pipe[F, TarArchiveEntry, PackageIndex] = { input => - input.flatMap { entry => - val parentName = entry.getName - val nestedEntries = entry.getDirectoryEntries - Stream.emits(nestedEntries.map(nested => PackageIndex(parentName, nested.getName, config.repository))) - } + .unNone + ) } } object HaskellIndexUnarchiver { - def apply[F[_]: ConcurrentEffect]( - config: LanguageConfig - ): HaskellIndexUnarchiver[F] = new HaskellIndexUnarchiver(config) + def apply[F[_]: ConcurrentEffect: ContextShift]( + unarchiver: Unarchiver[F], + config: HaskellConfig + ): HaskellIndexUnarchiver[F] = new HaskellIndexUnarchiver(unarchiver, config) } diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala new file mode 100644 index 0000000..076d7c6 --- /dev/null +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala @@ -0,0 +1,45 @@ +package codesearch.core.meta.unarchiver + +import java.nio.file.Path + +import cats.effect.{ContextShift, Sync} +import cats.syntax.functor._ +import codesearch.core.BlockingEC +import codesearch.core.config.RubyConfig +import codesearch.core.db.repository.PackageIndex +import fs2.Stream +import fs2.io.file +import io.circe.fs2.{byteArrayParser, decoder} + +import scala.sys.process._ + +final class RubyIndexUnarchiver[F[_]: Sync: ContextShift](config: RubyConfig) extends StreamIndexUnarchiver[F] { + + def unarchive(path: Path): F[Stream[F, PackageIndex]] = { + for { + _ <- Sync[F].delay { + Seq( + "ruby", + config.scriptPath.toString, + path.toString, + config.repoJsonPath.toString + ) !! 
+ } + } yield flatPackages + } + + private def flatPackages: F[Stream[F, PackageIndex]] = { + Sync[F].delay( + file + .readAll[F](config.repoJsonPath, BlockingEC, 4096) + .through(byteArrayParser) + .through(decoder[F, Seq[String]]) + .collect { case Seq(name, version, _) => PackageIndex(name, version, config.repository) }) + } +} + +object RubyIndexUnarchiver { + def apply[F[_]: Sync: ContextShift]( + config: RubyConfig + ): RubyIndexUnarchiver[F] = new RubyIndexUnarchiver(config) +} diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala index 8e23353..f3490cb 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala @@ -1,53 +1,49 @@ package codesearch.core.meta.unarchiver -import java.io.InputStream -import java.util.zip.ZipInputStream +import java.nio.file.Path -import cats.effect.{ConcurrentEffect, Sync} -import codesearch.core.config.LanguageConfig +import cats.effect.Sync +import cats.syntax.functor._ +import codesearch.core.config.RustConfig import codesearch.core.db.repository.PackageIndex -import fs2.{Pipe, Stream} -import fs2.io.toInputStream -import org.apache.commons.compress.archivers.zip.{ZipArchiveEntry, ZipArchiveInputStream, ZipFile} - -final class RustIndexUnarchiver[F[_]: ConcurrentEffect]( - config: LanguageConfig +import codesearch.core.util.{FsUtils, Unarchiver} +import fs2.Stream +import io.circe.Decoder +import io.circe.fs2._ +import org.rauschig.jarchivelib.ArchiveFormat.ZIP + +final class RustIndexUnarchiver[F[_]: Sync]( + unarchiver: Unarchiver[F], + config: RustConfig ) extends StreamIndexUnarchiver[F] { - def packages: Pipe[F, Byte, PackageIndex] = { input => - input - .through(toInputStream) - .through(zipEntries) - .through(flatPackages) + private implicit val packageDecoder: Decoder[PackageIndex] = { 
cursor => + for { + name <- cursor.get[String]("name") + version <- cursor.get[String]("vers") + } yield PackageIndex(name, version, config.repository) } - private def zipEntries: Pipe[F, InputStream, ZipArchiveEntry] = { input => - input.flatMap { inputStream => - Stream - .bracket(Sync[F].delay(new ZipArchiveInputStream(inputStream)))(zis => Sync[F].delay(zis.close())) - .flatMap { zipStream => - Stream.unfoldEval[F, ZipArchiveInputStream, ZipArchiveEntry](zipStream) { zipStream => - Sync[F].delay { - zipStream.getNextZipEntry match { - case entry: ZipArchiveEntry => Some(entry, zipStream) - case _ => None - } - } - } - } - } + def unarchive(path: Path): F[Stream[F, PackageIndex]] = { + for { + _ <- unarchiver.extract(path, config.repoPath, ZIP) + } yield flatPackages } - private def flatPackages: Pipe[F, ZipArchiveEntry, PackageIndex] = { input => - val a = ZipInputStream - val b = ZipFile - input.flatMap(_.) - } - + private def flatPackages: F[Stream[F, PackageIndex]] = { + Sync[F].delay( + FsUtils + .recursiveListFiles(config.repoPath.toFile) + .filter(file => !config.ignoreFiles.contains(file.getName)) + .evalMap(file => FsUtils.readFileAsync(file.getAbsolutePath).map(_.last)) + .through(stringStreamParser) + .through(decoder[F, PackageIndex])) + } } object RustIndexUnarchiver { - def apply[F[_]: ConcurrentEffect]( - config: LanguageConfig - ): RustIndexUnarchiver[F] = new RustIndexUnarchiver(config) + def apply[F[_]: Sync]( + unarchiver: Unarchiver[F], + config: RustConfig + ): RustIndexUnarchiver[F] = new RustIndexUnarchiver(unarchiver, config) } diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala index 6003e05..16ccd5e 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala @@ -1,8 +1,10 @@ package 
codesearch.core.meta.unarchiver +import java.nio.file.Path + import codesearch.core.db.repository.PackageIndex -import fs2.Pipe +import fs2.Stream trait StreamIndexUnarchiver[F[_]] { - def packages: Pipe[F, Byte, PackageIndex] + def unarchive(path: Path): F[Stream[F, PackageIndex]] } diff --git a/core/src/main/scala/codesearch/core/util/FsUtils.scala b/core/src/main/scala/codesearch/core/util/FsUtils.scala new file mode 100644 index 0000000..001599f --- /dev/null +++ b/core/src/main/scala/codesearch/core/util/FsUtils.scala @@ -0,0 +1,23 @@ +package codesearch.core.util + +import java.io.File + +import cats.effect.{Resource, Sync} +import fs2.{Chunk, Stream} + +import scala.io.Source + +object FsUtils { + + def recursiveListFiles[F[_]: Sync](cur: File): Stream[F, File] = { + val stream = Stream.evalUnChunk(Sync[F].delay(Chunk.array(cur.listFiles))) + val files = stream.filter(_.isFile) + val filesFromDirs = stream.filter(_.isDirectory).flatMap(recursiveListFiles) + files ++ filesFromDirs + } + + def readFileAsync[F[_]: Sync](path: String): F[List[String]] = + Resource + .fromAutoCloseable(Sync[F].delay(Source.fromFile(path, "UTF-8"))) + .use(source => Sync[F].delay(source.getLines.toList)) +} diff --git a/core/src/main/scala/codesearch/core/util/Unarchiver.scala b/core/src/main/scala/codesearch/core/util/Unarchiver.scala index 0110e5e..de4dc3b 100644 --- a/core/src/main/scala/codesearch/core/util/Unarchiver.scala +++ b/core/src/main/scala/codesearch/core/util/Unarchiver.scala @@ -21,19 +21,19 @@ trait Unarchiver[F[_]] { } object Unarchiver { - def apply[F[_]](implicit F: Sync[F]): Unarchiver[F] = new Unarchiver[F] { + def apply[F[_]: Sync]: Unarchiver[F] = new Unarchiver[F] { def extract( archive: Path, to: Path, format: ArchiveFormat, compressionType: CompressionType - ): F[Unit] = F.delay { + ): F[Unit] = Sync[F].delay { ArchiverFactory .createArchiver(format, compressionType) .extract(archive.toFile, to.toFile) } - def extract(archive: Path, to: Path, 
archiveFormat: ArchiveFormat): F[Unit] = F.delay { + def extract(archive: Path, to: Path, archiveFormat: ArchiveFormat): F[Unit] = Sync[F].delay { ArchiverFactory .createArchiver(archiveFormat) .extract(archive.toFile, to.toFile) From 3d2bff0465442efc65fd60f8ce95af285e675aaf Mon Sep 17 00:00:00 2001 From: kamilongus Date: Sun, 28 Apr 2019 22:34:01 +0400 Subject: [PATCH 4/7] + some changes Issue: https://github.com/aelve/codesearch/issues/250 --- core/src/main/resources/application.conf | 4 + .../scala/codesearch/core/config/Config.scala | 30 ++++-- .../db/repository/PackageDbRepository.scala | 44 +++++++++ .../repository/PackageIndexDbRepository.scala | 58 +++++++++++ .../core/db/repository/PackageIndexRep.scala | 46 --------- .../core/db/repository/PackageRep.scala | 35 ------- .../core/index/JavaScriptIndex.scala | 42 -------- .../codesearch/core/index/LanguageIndex.scala | 97 +++++++++++-------- .../codesearch/core/index/RubyIndex.scala | 50 ---------- .../codesearch/core/index/RustIndex.scala | 66 ------------- .../core/index/indexer/HaskellIndexer.scala | 6 +- .../core/index/indexer/Indexer.scala | 10 +- .../index/indexer/JavaScriptIndexer.scala | 6 +- .../core/index/indexer/RubyIndexer.scala | 6 +- .../core/index/indexer/RustIndexer.scala | 6 +- .../downloader/ArchivedIndexDownloader.scala | 10 +- .../ByteStreamIndexDownloader.scala | 14 +-- .../downloader/CratesIndexDownloader.scala | 4 +- .../meta/downloader/GemIndexDownloader.scala | 4 +- .../downloader/HackageIndexDownloader.scala | 4 +- ...nloader.scala => NpmIndexDownloader.scala} | 6 +- .../meta/parser/IndexByteStreamParser.scala | 4 +- .../meta/parser/JavaScriptIndexParser.scala | 10 +- .../unarchiver/HaskellIndexUnarchiver.scala | 10 +- .../meta/unarchiver/RubyIndexUnarchiver.scala | 12 +-- .../meta/unarchiver/RustIndexUnarchiver.scala | 16 +-- .../unarchiver/StreamIndexUnarchiver.scala | 6 +- .../HaskellPackageSourcesUpdater.scala | 5 + .../core/sources/PackageSourcesUpdater.scala | 28 ++++++ 
.../downloader/HaskellSourceDownloader.scala | 19 ++++ .../downloader/SourcesDownloader.scala | 38 ++++++++ .../core/sources/filter/FileFilter.scala | 45 +++++++++ .../core/sources/filter/FileFilters.scala | 30 ++++++ .../unarchiver/RubySourcesUnarchiver.scala | 22 +++++ .../unarchiver/SourcesUnarchiver.scala | 44 +++++++++ 35 files changed, 474 insertions(+), 363 deletions(-) create mode 100644 core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala create mode 100644 core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala delete mode 100644 core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala delete mode 100644 core/src/main/scala/codesearch/core/db/repository/PackageRep.scala delete mode 100644 core/src/main/scala/codesearch/core/index/JavaScriptIndex.scala delete mode 100644 core/src/main/scala/codesearch/core/index/RubyIndex.scala delete mode 100644 core/src/main/scala/codesearch/core/index/RustIndex.scala rename core/src/main/scala/codesearch/core/meta/downloader/{NpmMetaDownloader.scala => NpmIndexDownloader.scala} (85%) create mode 100644 core/src/main/scala/codesearch/core/sources/HaskellPackageSourcesUpdater.scala create mode 100644 core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala create mode 100644 core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala create mode 100644 core/src/main/scala/codesearch/core/sources/unarchiver/RubySourcesUnarchiver.scala create mode 100644 core/src/main/scala/codesearch/core/sources/unarchiver/SourcesUnarchiver.scala diff --git a/core/src/main/resources/application.conf b/core/src/main/resources/application.conf index b984e0b..da1656a 
100644 --- a/core/src/main/resources/application.conf +++ b/core/src/main/resources/application.conf @@ -21,6 +21,7 @@ languagesConfig { haskell { repository = "hackage" repoIndexUrl = "http://hackage.haskell.org/packages/index.tar.gz" + packageUrl = "https://hackage.haskell.org/package/%1$s-%2$s/%1$s-%2$s.tar.gz" repoArchivePath = "./data/meta/haskell/index.tar.gz" repoPath = "./data/meta/haskell/" concurrentTasksCount = 30 @@ -28,6 +29,7 @@ languagesConfig { rust { repository = "crates" repoIndexUrl = "https://github.com/rust-lang/crates.io-index/archive/master.zip" + packageUrl = "https://crates.io/api/v1/crates/%s/%s/download" repoArchivePath = "./data/meta/rust/archive.zip" repoPath = "./data/meta/rust/" concurrentTasksCount = 30 @@ -36,6 +38,7 @@ languagesConfig { ruby { repository = "gem" repoIndexUrl = "http://rubygems.org/latest_specs.4.8.gz" + packageUrl = "https://rubygems.org/downloads/%s-%s.gem" repoArchivePath = "./data/meta/ruby/ruby_index.gz" repoJsonPath = "./data/meta/ruby/ruby_index.json" scriptPath = "./scripts/update_index.rb" @@ -44,6 +47,7 @@ languagesConfig { javascript { repository = "npm" repoIndexUrl = "https://replicate.npmjs.com/_all_docs?include_docs=true" + packageUrl = "https://registry.npmjs.org/%1$s/-/%1$s-%2$s.tgz" concurrentTasksCount = 30 } } diff --git a/core/src/main/scala/codesearch/core/config/Config.scala b/core/src/main/scala/codesearch/core/config/Config.scala index 6b2e5e2..085de2a 100644 --- a/core/src/main/scala/codesearch/core/config/Config.scala +++ b/core/src/main/scala/codesearch/core/config/Config.scala @@ -7,12 +7,13 @@ import cats.effect.Sync import pureconfig.module.catseffect._ import pureconfig.{CamelCase, ConfigFieldMapping, ProductHint} -trait RemoteIndexConfig { +trait RepositoryConfig { def repository: String def repoIndexUrl: URI + def packageUrl: String } -trait IndexArchiveConfig extends RemoteIndexConfig { +trait ArchivedIndexConfig extends RepositoryConfig { def repoArchivePath: Path } @@ -28,7 
+29,13 @@ case class DatabaseConfig( port: Int, name: String, user: String, - password: String + password: String, + properties: DatabaseProperties +) + +case class DatabaseProperties( + driver: String, + url: String ) case class SnippetConfig( @@ -47,34 +54,43 @@ case class LanguagesConfig( case class HaskellConfig( repository: String, repoIndexUrl: URI, + packageUrl: String, repoArchivePath: Path, repoPath: Path, concurrentTasksCount: Int -) extends IndexArchiveConfig +) extends ArchivedIndexConfig case class RubyConfig( repository: String, repoIndexUrl: URI, + packageUrl: String, repoArchivePath: Path, repoJsonPath: Path, scriptPath: Path, concurrentTasksCount: Int -) extends IndexArchiveConfig +) extends ArchivedIndexConfig case class RustConfig( repository: String, repoIndexUrl: URI, + packageUrl: String, repoArchivePath: Path, repoPath: Path, concurrentTasksCount: Int, ignoreFiles: Set[String] -) extends IndexArchiveConfig +) extends ArchivedIndexConfig case class JavaScriptConfig( repository: String, repoIndexUrl: URI, + packageUrl: String, concurrentTasksCount: Int -) extends RemoteIndexConfig +) extends RepositoryConfig + +case class SourceFilesExtensions( + commonExtensions: Set[String], + sourcesExtensions: Set[String] +) case class MetricsConfig( enableMatomoMetrics: Boolean diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala b/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala new file mode 100644 index 0000000..967ccc8 --- /dev/null +++ b/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala @@ -0,0 +1,44 @@ +package codesearch.core.db.repository + +import cats.Monad +import doobie.Transactor +import doobie.implicits._ +import fs2.Stream + +final case class Package( + name: String, + version: String +) + +final case class PackageTableRow( + name: String, + version: String, + repository: String +) + +trait PackageDbRepository[F[_]] { + def upsert(`package`: 
PackageTableRow): F[Int] + def findByRepository(repository: String): Stream[F, Package] +} + +object PackageDbRepository { + def apply[F[_]: Monad](xa: Transactor[F]): PackageDbRepository[F] = new PackageDbRepository[F] { + def upsert(`package`: PackageTableRow): F[Int] = { + sql""" + INSERT INTO package(name, version, repository, updated_at) + VALUES (${`package`.name}, ${`package`.version}, ${`package`.repository}, now()) + ON CONFLICT (name, repository) DO UPDATE + SET version = excluded.version, + updated_at = excluded.updated_at + """.update.run.transact(xa) + } + + def findByRepository(repository: String): Stream[F, Package] = { + sql""" + SELECT name, version + FROM package + WHERE repository = $repository + """.query[Package].stream.transact(xa) + } + } +} diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala b/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala new file mode 100644 index 0000000..625f57b --- /dev/null +++ b/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala @@ -0,0 +1,58 @@ +package codesearch.core.db.repository + +import cats.Monad +import cats.implicits._ +import doobie._ +import doobie.implicits._ +import fs2.Stream + +final case class PackageIndexTableRow( + name: String, + version: String, + repository: String +) + +final case class PackageIndex( + name: String, + version: String +) + +trait PackageIndexDbRepository[F[_]] { + def batchUpsert(packages: List[PackageIndexTableRow]): F[Int] + def batchUpsert(stream: Stream[F, PackageIndexTableRow]): F[Int] + def findNew(repository: String): Stream[F, PackageIndex] +} + +object PackageIndexDbRepository { + def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexDbRepository[F] = new PackageIndexDbRepository[F] { + + def batchUpsert(packages: List[PackageIndexTableRow]): F[Int] = { + Update[PackageIndexTableRow]( + """ + |INSERT INTO repository_index(name, version, repository) + |VALUES (?, ?, 
?) + |ON CONFLICT (name, repository) DO UPDATE + | SET version = excluded.version + """.stripMargin + ).updateMany(packages).transact(xa) + } + + def batchUpsert(stream: Stream[F, PackageIndexTableRow]): F[Int] = { + val insertBatchSize = 10000 + stream + .chunkN(insertBatchSize) + .map(packages => batchUpsert(packages.toList)) + .compile + .drain + } + + def findNew(repository: String): Stream[F, PackageIndexTableRow] = { + sql""" + SELECT r.name, r.version, r.repository + FROM repository_index r + LEFT JOIN package p + ON r.name <> p.name AND r.version <> p.version + """.query[PackageIndex].stream.transact(xa) + } + } +} diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala b/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala deleted file mode 100644 index f9d09f8..0000000 --- a/core/src/main/scala/codesearch/core/db/repository/PackageIndexRep.scala +++ /dev/null @@ -1,46 +0,0 @@ -package codesearch.core.db.repository - -import cats.Monad -import cats.implicits._ -import doobie._ -import doobie.implicits._ -import fs2.Stream - -final case class PackageIndex( - name: String, - version: String, - repository: String -) - -trait PackageIndexRep[F[_]] { - def insertIndexes(packages: List[PackageIndex]): F[Int] - def insertIndexes(stream: Stream[F, PackageIndex]): F[Int] -} - -object PackageIndexRep { - - private val batchInsertQuery = - """ - |INSERT INTO repository_index(name, version, repository) - | VALUES (?, ?, ?) 
- | ON CONFLICT (name, repository) DO UPDATE - | SET version = excluded.version - """.stripMargin - - def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexRep[F] = new PackageIndexRep[F] { - - def insertIndexes(packages: List[PackageIndex]): F[Int] = - Update[PackageIndex](batchInsertQuery) - .updateMany(packages) - .transact(xa) - - def insertIndexes(stream: Stream[F, PackageIndex]): F[Int] = { - val batchSize = 10000 - stream - .chunkN(batchSize) - .map(packages => insertIndexes(packages.toList)) - .compile - .drain - } - } -} diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageRep.scala b/core/src/main/scala/codesearch/core/db/repository/PackageRep.scala deleted file mode 100644 index 211c185..0000000 --- a/core/src/main/scala/codesearch/core/db/repository/PackageRep.scala +++ /dev/null @@ -1,35 +0,0 @@ -package codesearch.core.db.repository - -import java.time.LocalDateTime - -import cats.Monad -import doobie.Transactor -import doobie.implicits._ -import fs2.Stream - -final case class PackageTableRow( - name: String, - version: String, - repository: String, - updatedAt: LocalDateTime -) - -final case class Package( - name: String, - version: String -) - -trait PackageRep[F[_]] { - def findByRepository(repository: String): Stream[F, Package] -} - -object PackageRep { - def apply[F[_]: Monad](xa: Transactor[F]): PackageRep[F] = new PackageRep[F] { - def findByRepository(repository: String): Stream[F, Package] = { - sql"SELECT name, version FROM package WHERE repository = $repository" - .query[Package] - .stream - .transact(xa) - } - } -} diff --git a/core/src/main/scala/codesearch/core/index/JavaScriptIndex.scala b/core/src/main/scala/codesearch/core/index/JavaScriptIndex.scala deleted file mode 100644 index 0b429d0..0000000 --- a/core/src/main/scala/codesearch/core/index/JavaScriptIndex.scala +++ /dev/null @@ -1,42 +0,0 @@ -package codesearch.core.index - -import java.nio.file.Path - -import cats.effect.{ContextShift, IO} -import 
cats.syntax.flatMap._ -import codesearch.core.config.{Config, JavaScriptConfig} -import codesearch.core.db.NpmDB -import codesearch.core.index.details.NpmDetails -import codesearch.core.index.repository.{NpmPackage, SourcesDownloader} -import codesearch.core.index.directory.Directory._ -import codesearch.core.index.directory.Directory.ops._ -import codesearch.core.index.directory.СindexDirectory -import codesearch.core.index.directory.СindexDirectory.JavaScriptCindex -import codesearch.core.model.NpmTable -import fs2.Stream - -class JavaScriptIndex(config: JavaScriptConfig)( - implicit val shift: ContextShift[IO], - sourcesDownloader: SourcesDownloader[IO, NpmPackage] -) extends LanguageIndex[NpmTable] with NpmDB { - - override protected val cindexDir: СindexDirectory = JavaScriptCindex - - override protected def concurrentTasksCount: Int = config.concurrentTasksCount - - override protected def updateSources(name: String, version: String): IO[Int] = { - logger.info(s"downloading package $name") >> archiveDownloadAndExtract(NpmPackage(name, version)) - } - - override protected def getLastVersions: Stream[IO, (String, String)] = NpmDetails(config).detailsMap - - override protected def buildFsUrl(packageName: String, version: String): Path = - NpmPackage(packageName, version).packageDir -} - -object JavaScriptIndex { - def apply(config: Config)( - implicit shift: ContextShift[IO], - sourcesDownloader: SourcesDownloader[IO, NpmPackage] - ) = new JavaScriptIndex(config.languagesConfig.javascript) -} diff --git a/core/src/main/scala/codesearch/core/index/LanguageIndex.scala b/core/src/main/scala/codesearch/core/index/LanguageIndex.scala index 3039fbd..d4a73cd 100644 --- a/core/src/main/scala/codesearch/core/index/LanguageIndex.scala +++ b/core/src/main/scala/codesearch/core/index/LanguageIndex.scala @@ -4,24 +4,21 @@ import java.nio.file.StandardCopyOption.REPLACE_EXISTING import java.nio.file.StandardOpenOption.{CREATE, TRUNCATE_EXISTING} import java.nio.file.{Files, 
Path => NioPath} -import cats.effect.{ContextShift, Sync} +import cats.effect.{ContextShift, IO} import cats.instances.int._ -import cats.effect._ -import cats.instances.list._ -import cats.syntax.applicative._ +import cats.instances.vector._ import cats.syntax.flatMap._ import cats.syntax.foldable._ -import cats.syntax.traverse._ import cats.syntax.functor._ +import cats.syntax.monadError._ import codesearch.core.BlockingEC import codesearch.core.db.DefaultDB +import codesearch.core.index.directory.{Directory, СSearchDirectory} import codesearch.core.index.directory.{Directory, СindexDirectory} import codesearch.core.index.repository._ import codesearch.core.model.DefaultTable import codesearch.core.syntax.stream._ -import codesearch.core.util.manatki.syntax.raise._ import fs2.Stream -import fs2.Chunk import fs2.io.file import fs2.text.utf8Encode import io.chrisdavenport.log4cats.SelfAwareStructuredLogger @@ -29,12 +26,14 @@ import io.chrisdavenport.log4cats.slf4j.Slf4jLogger import scala.sys.process.Process -trait LanguageIndex[F[_]: Sync] { +trait LanguageIndex[A <: DefaultTable] { + self: DefaultDB[A] => - protected implicit def shift: ContextShift[F] + protected implicit def shift: ContextShift[IO] - protected val logger: SelfAwareStructuredLogger[F] = Slf4jLogger.unsafeCreate[F] + protected val logger: SelfAwareStructuredLogger[IO] = Slf4jLogger.unsafeCreate[IO] + protected def csearchDir: СSearchDirectory protected def cindexDir: СindexDirectory protected def concurrentTasksCount: Int @@ -43,7 +42,7 @@ trait LanguageIndex[F[_]: Sync] { * Build new index from only latest version of each package and * replace old index with new one. 
*/ - def buildIndex: F[Unit] = { + def buildIndex: IO[Unit] = { def latestPackagePaths = verNames.map { versions => versions.map { case (packageName, version) => @@ -51,35 +50,50 @@ trait LanguageIndex[F[_]: Sync] { } } - def dropTempIndexFile = F(Files.deleteIfExists(cindexDir.tempIndexDirAs[NioPath])) - - def createCSearchDir = ( - if (Files.notExists(СindexDirectory.root)) - Files.createDirectories(СindexDirectory.root) - ).pure[F].widen + def dropTempIndexFile = IO(Files.deleteIfExists(csearchDir.tempIndexDirAs[NioPath])) + def dropTempIndexFile = IO(Files.deleteIfExists(cindexDir.tempIndexDirAs[NioPath])) + + def createCSearchDir = IO( + if (Files.notExists(СSearchDirectory.root)) + Files.createDirectories(СSearchDirectory.root) + if (Files.notExists(СindexDirectory.root)) + Files.createDirectories(СindexDirectory.root) + ) + + def indexPackages(packageDirs: Seq[NioPath]): IO[Unit] = { + def cindex(packages: Seq[NioPath]) = { + val args = "cindex" +: packages.map(_.toString) + val env = Seq("CSEARCHINDEX" -> csearchDir.tempIndexDirAs[String]) + IO { Process(args, None, env: _*) ! } + .ensureOr(BadExitCode)(_ == 0) + } - def indexPackages(packageDirs: Seq[NioPath]): F[Unit] = { + val batchSize = 10000 + packageDirs.grouped(batchSize).toVector.traverse_(cindex) val args = Seq("cindex", cindexDir.dirsToIndex[String]) val env = Seq("CSEARCHINDEX" -> cindexDir.tempIndexDirAs[String]) for { _ <- Stream - .emits(packageDirs) - .covary[F] - .map(_.toString + "\n") - .through(utf8Encode) - .through(file.writeAll(cindexDir.dirsToIndex[NioPath], BlockingEC, List(CREATE, TRUNCATE_EXISTING))) - .compile - .drain - _ <- (Process(args, None, env: _*) !).pure[F].widen + .emits(packageDirs) + .covary[IO] + .map(_.toString + "\n") + .through(utf8Encode) + .to(file.writeAll(cindexDir.dirsToIndex[NioPath], BlockingEC, List(CREATE, TRUNCATE_EXISTING))) + .compile + .drain + _ <- IO(Process(args, None, env: _*) !) 
} yield () } - def replaceIndexFile = + def replaceIndexFile = IO( Files.move( + csearchDir.tempIndexDirAs[NioPath], + csearchDir.indexDirAs[NioPath], cindexDir.tempIndexDirAs[NioPath], cindexDir.indexDirAs[NioPath], REPLACE_EXISTING - ).pure[F].widen + ) + ) for { packageDirs <- latestPackagePaths @@ -95,19 +109,17 @@ trait LanguageIndex[F[_]: Sync] { * * @return count of updated packages */ - def updatePackages(limit: Option[Int]): F[Int] = { - val chunkSize = 10000 - val packages: Stream[F, (String, String)] = getLastVersions.chunkN(chunkSize).flat.filterNotM { - case (packageName, packageVersion) => - packageIsExists(packageName, packageVersion) + def updatePackages(limit: Option[Int]): IO[Int] = { + val packages: Stream[IO, (String, String)] = getLastVersions.filterNotM { + case (packageName, packageVersion) => packageIsExists(packageName, packageVersion) } logger.debug("UPDATE PACKAGES") >> limit - .map(packages.take(_)) - .getOrElse(packages) - .mapAsyncUnordered(concurrentTasksCount)(updateSources _ tupled) - .compile - .foldMonoid + .map(packages.take(_)) + .getOrElse(packages) + .mapAsyncUnordered(concurrentTasksCount)(updateSources _ tupled) + .compile + .foldMonoid } /** @@ -120,8 +132,8 @@ trait LanguageIndex[F[_]: Sync] { protected def buildFsUrl(packageName: String, version: String): NioPath protected def archiveDownloadAndExtract[B <: SourcePackage: Directory](pack: B)( - implicit repository: SourcesDownloader[F, B] - ): F[Int] = { + implicit repository: SourcesDownloader[IO, B] + ): IO[Int] = { val task = for { _ <- repository.downloadSources(pack) rowsCount <- insertOrUpdate(pack) @@ -135,7 +147,7 @@ trait LanguageIndex[F[_]: Sync] { * * @return last versions of packages */ - protected def getLastVersions: Stream[F, (String, String)] + protected def getLastVersions: Stream[IO, (String, String)] /** * Update source code from remote repository @@ -145,7 +157,8 @@ trait LanguageIndex[F[_]: Sync] { * @param version of package * @return count of 
downloaded files (source files) */ - protected def updateSources(name: String, version: String): F[Int] + protected def updateSources(name: String, version: String): IO[Int] } case class BadExitCode(code: Int) extends Exception(s"Process returned a bad exit code: $code") + diff --git a/core/src/main/scala/codesearch/core/index/RubyIndex.scala b/core/src/main/scala/codesearch/core/index/RubyIndex.scala deleted file mode 100644 index 2f3c8c1..0000000 --- a/core/src/main/scala/codesearch/core/index/RubyIndex.scala +++ /dev/null @@ -1,50 +0,0 @@ -package codesearch.core.index - -import java.nio.file.Path - -import cats.effect.{ContextShift, IO} -import cats.syntax.flatMap._ -import codesearch.core._ -import codesearch.core.config.{Config, RubyConfig} -import codesearch.core.db.GemDB -import codesearch.core.index.directory.Directory._ -import codesearch.core.index.directory.Directory.ops._ -import codesearch.core.index.directory.СindexDirectory -import codesearch.core.index.directory.СindexDirectory.RubyCindex -import codesearch.core.index.repository.{GemPackage, SourcesDownloader} -import codesearch.core.model.GemTable -import io.circe.fs2._ -import fs2.Stream -import fs2.io.file - -class RubyIndex(rubyConfig: RubyConfig)( - implicit val shift: ContextShift[IO], - sourcesDownloader: SourcesDownloader[IO, GemPackage] -) extends LanguageIndex[GemTable] with GemDB { - - override protected val cindexDir: СindexDirectory = RubyCindex - - override protected def concurrentTasksCount: Int = rubyConfig.concurrentTasksCount - - override protected def updateSources(name: String, version: String): IO[Int] = { - logger.info(s"downloading package $name") >> archiveDownloadAndExtract(GemPackage(name, version)) - } - - override protected def getLastVersions: Stream[IO, (String, String)] = { - file - .readAll[IO](rubyConfig.repoJsonPath, BlockingEC, 4096) - .through(byteArrayParser[IO]) - .through(decoder[IO, Seq[String]]) - .collect { case Seq(name, version, _) => name -> version } - 
} - - override protected def buildFsUrl(packageName: String, version: String): Path = - GemPackage(packageName, version).packageDir -} - -object RubyIndex { - def apply(config: Config)( - implicit shift: ContextShift[IO], - sourcesDownloader: SourcesDownloader[IO, GemPackage] - ) = new RubyIndex(config.languagesConfig.ruby) -} diff --git a/core/src/main/scala/codesearch/core/index/RustIndex.scala b/core/src/main/scala/codesearch/core/index/RustIndex.scala deleted file mode 100644 index 6e46df8..0000000 --- a/core/src/main/scala/codesearch/core/index/RustIndex.scala +++ /dev/null @@ -1,66 +0,0 @@ -package codesearch.core.index - -import java.nio.file.Path - -import cats.effect.{ContextShift, IO} -import cats.syntax.flatMap._ -import codesearch.core.config.{Config, RustConfig} -import codesearch.core.db.CratesDB -import codesearch.core.index.directory.Directory._ -import codesearch.core.index.directory.Directory.ops._ -import codesearch.core.index.directory.СindexDirectory -import codesearch.core.index.directory.СindexDirectory.RustCindex -import codesearch.core.index.repository.{CratesPackage, SourcesDownloader} -import codesearch.core.model.CratesTable -import codesearch.core.util.Helper -import fs2.Stream -import io.circe.Decoder -import io.circe.fs2._ - -class RustIndex(rustConfig: RustConfig)( - implicit val shift: ContextShift[IO], - sourcesDownloader: SourcesDownloader[IO, CratesPackage] -) extends LanguageIndex[CratesTable] with CratesDB { - - private val IgnoreFiles = Set( - "test-max-version-example-crate", - "version-length-checking-is-overrated", - "config.json", - "archive.zip", - ".git" - ) - - override protected val cindexDir: СindexDirectory = RustCindex - - override protected def concurrentTasksCount: Int = rustConfig.concurrentTasksCount - - override protected def updateSources(name: String, version: String): IO[Int] = { - logger.info(s"downloading package $name") >> archiveDownloadAndExtract(CratesPackage(name, version)) - } - - override protected 
def getLastVersions: Stream[IO, (String, String)] = { - implicit val packageDecoder: Decoder[(String, String)] = { c => - for { - name <- c.get[String]("name") - version <- c.get[String]("vers") - } yield name -> version - } - - Helper - .recursiveListFiles(rustConfig.repoPath.toFile) - .filter(file => !IgnoreFiles.contains(file.getName)) - .evalMap(file => Helper.readFileAsync(file.getAbsolutePath).map(_.last)) - .through(stringStreamParser) - .through(decoder[IO, (String, String)]) - } - - override protected def buildFsUrl(packageName: String, version: String): Path = - CratesPackage(packageName, version).packageDir -} - -object RustIndex { - def apply(config: Config)( - implicit shift: ContextShift[IO], - sourcesDownloader: SourcesDownloader[IO, CratesPackage] - ) = new RustIndex(config.languagesConfig.rust) -} diff --git a/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala index 0eac62b..fc332ba 100644 --- a/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala +++ b/core/src/main/scala/codesearch/core/index/indexer/HaskellIndexer.scala @@ -4,12 +4,8 @@ import cats.effect.{ContextShift, Sync} import codesearch.core.index.directory.СindexDirectory.HaskellCindex import doobie.util.transactor.Transactor -final class HaskellIndexer[F[_]: Sync: ContextShift]( - xa: Transactor[F] -) extends SourcesIndexer[F](HaskellCindex, "hackage", xa) - object HaskellIndexer { def apply[F[_]: Sync: ContextShift]( xa: Transactor[F] - ): HaskellIndexer[F] = new HaskellIndexer(xa) + ): SourcesIndexer[F] = new SourcesIndexer(HaskellCindex, "hackage", xa) } diff --git a/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala b/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala index b0ad511..a0516dd 100644 --- a/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala +++ b/core/src/main/scala/codesearch/core/index/indexer/Indexer.scala @@ -8,7 +8,7 
@@ import cats.effect._ import cats.syntax.flatMap._ import cats.syntax.functor._ import codesearch.core.BlockingEC -import codesearch.core.db.repository.{Package, PackageRep} +import codesearch.core.db.repository.{Package, PackageDbRepository} import codesearch.core.index.directory.{Directory, СindexDirectory} import codesearch.core.syntax.path._ import doobie.util.transactor.Transactor @@ -40,7 +40,7 @@ private[indexer] class SourcesIndexer[F[_]: Sync: ContextShift]( } private def latestPackagePaths: F[Stream[F, NioPath]] = Sync[F].pure( - PackageRep[F](xa) + PackageDbRepository[F](xa) .findByRepository(repository) .through(buildFsPath) ) @@ -49,15 +49,15 @@ private[indexer] class SourcesIndexer[F[_]: Sync: ContextShift]( input.map(`package` => Directory.sourcesDir / repository / `package`.name / `package`.version) } - private def dropTempIndexFile: F[Boolean] = - Sync[F].delay(Files.deleteIfExists(indexDir.tempIndexDirAs[NioPath])) - private def createCSearchDir: F[Option[NioPath]] = Sync[F].delay( if (Files.notExists(СindexDirectory.root)) Some(Files.createDirectories(СindexDirectory.root)) else None ) + private def dropTempIndexFile: F[Boolean] = + Sync[F].delay(Files.deleteIfExists(indexDir.tempIndexDirAs[NioPath])) + private def dirsToIndex(stream: Stream[F, NioPath]): F[Unit] = { stream .map(_.toString + "\n") diff --git a/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala index ced6c78..683f204 100644 --- a/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala +++ b/core/src/main/scala/codesearch/core/index/indexer/JavaScriptIndexer.scala @@ -4,12 +4,8 @@ import cats.effect.{ContextShift, Sync} import codesearch.core.index.directory.СindexDirectory.JavaScriptCindex import doobie.util.transactor.Transactor -final class JavaScriptIndexer[F[_]: Sync: ContextShift]( - xa: Transactor[F] -) extends SourcesIndexer[F](JavaScriptCindex, "npm", xa) 
- object JavaScriptIndexer { def apply[F[_]: Sync: ContextShift]( xa: Transactor[F] - ): JavaScriptIndexer[F] = new JavaScriptIndexer(xa) + ): SourcesIndexer[F] = new SourcesIndexer[F](JavaScriptCindex, "npm", xa) } diff --git a/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala index af606e2..3f46991 100644 --- a/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala +++ b/core/src/main/scala/codesearch/core/index/indexer/RubyIndexer.scala @@ -4,12 +4,8 @@ import cats.effect.{ContextShift, Sync} import codesearch.core.index.directory.СindexDirectory.RubyCindex import doobie.util.transactor.Transactor -final class RubyIndexer[F[_]: Sync: ContextShift]( - xa: Transactor[F] -) extends SourcesIndexer[F](RubyCindex, "gem", xa) - object RubyIndexer { def apply[F[_]: Sync: ContextShift]( xa: Transactor[F] - ): RubyIndexer[F] = new RubyIndexer(xa) + ): SourcesIndexer[F] = new SourcesIndexer(RubyCindex, "gem", xa) } diff --git a/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala b/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala index 457e923..b76b042 100644 --- a/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala +++ b/core/src/main/scala/codesearch/core/index/indexer/RustIndexer.scala @@ -4,12 +4,8 @@ import cats.effect.{ContextShift, Sync} import codesearch.core.index.directory.СindexDirectory.RustCindex import doobie.util.transactor.Transactor -final class RustIndexer[F[_]: Sync: ContextShift]( - xa: Transactor[F] -) extends SourcesIndexer[F](RustCindex, "crates", xa) - object RustIndexer { def apply[F[_]: Sync: ContextShift]( xa: Transactor[F] - ): RustIndexer[F] = new RustIndexer(xa) + ): SourcesIndexer[F] = new SourcesIndexer(RustCindex, "crates", xa) } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala 
b/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala index e7a3485..7311d29 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala @@ -3,8 +3,8 @@ package codesearch.core.meta.downloader import cats.effect.{ContextShift, Sync} import cats.syntax.flatMap._ import cats.syntax.functor._ -import codesearch.core.config.IndexArchiveConfig -import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.config.ArchivedIndexConfig +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.unarchiver.StreamIndexUnarchiver import com.softwaremill.sttp.Uri @@ -12,10 +12,10 @@ import io.chrisdavenport.log4cats.Logger import org.apache.commons.io.FileUtils private[meta] class ArchivedIndexDownloader[F[_]: Sync: ContextShift]( - config: IndexArchiveConfig, + config: ArchivedIndexConfig, downloader: Downloader[F], unarchiver: StreamIndexUnarchiver[F], - indexRep: PackageIndexRep[F], + indexRep: PackageIndexDbRepository[F], logger: Logger[F] ) extends RepositoryIndexDownloader[F] { @@ -24,7 +24,7 @@ private[meta] class ArchivedIndexDownloader[F[_]: Sync: ContextShift]( _ <- logger.info(s"Downloading ${config.repository} meta information") path <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) stream <- unarchiver.unarchive(path) - _ <- indexRep.insertIndexes(stream) + _ <- indexRep.batchUpsert(stream) _ <- Sync[F].delay(FileUtils.cleanDirectory(config.repoArchivePath.getParent.toFile)) _ <- logger.info("Downloading finished") } yield () diff --git a/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala index c72e8b9..1744309 100644 --- 
a/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala @@ -4,17 +4,17 @@ import cats.effect.{ContextShift, Sync} import cats.syntax.applicative._ import cats.syntax.flatMap._ import cats.syntax.functor._ -import codesearch.core.config.RemoteIndexConfig -import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.config.RepositoryConfig +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.parser.IndexByteStreamParser import com.softwaremill.sttp.Uri import io.chrisdavenport.log4cats.Logger -class ByteStreamIndexDownloader[F[_]: Sync: ContextShift]( - config: RemoteIndexConfig, +private[meta] class ByteStreamIndexDownloader[F[_]: Sync: ContextShift]( + config: RepositoryConfig, downloader: Downloader[F], - indexRep: PackageIndexRep[F], + indexRep: PackageIndexDbRepository[F], indexParser: IndexByteStreamParser[F], logger: Logger[F] ) extends RepositoryIndexDownloader[F] { @@ -22,9 +22,9 @@ class ByteStreamIndexDownloader[F[_]: Sync: ContextShift]( def download: F[Unit] = for { _ <- logger.info(s"Downloading ${config.repository} meta information") - stream <- downloader.download(Uri(config.repoIndexUrl)).pure[F] + stream <- downloader.download(Uri(config.repoIndexUrl)).pure[F].widen index <- indexParser.parse(stream) - _ <- indexRep.insertIndexes(index) + _ <- indexRep.batchUpsert(index) _ <- logger.info("Downloading finished") } yield () } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala index bc72203..5b8778e 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/CratesIndexDownloader.scala @@ -3,7 +3,7 @@ package 
codesearch.core.meta.downloader import cats.effect.{ContextShift, Sync} import cats.syntax.functor._ import codesearch.core.config.RustConfig -import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.unarchiver.RustIndexUnarchiver import codesearch.core.util.Unarchiver @@ -24,7 +24,7 @@ object CratesIndexDownloader { config, downloader, RustIndexUnarchiver(unarchiver, config), - PackageIndexRep(xa), + PackageIndexDbRepository(xa), logger ) } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala index b469151..0a03a0f 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/GemIndexDownloader.scala @@ -3,7 +3,7 @@ package codesearch.core.meta.downloader import cats.effect.{ContextShift, Sync} import cats.syntax.functor._ import codesearch.core.config.RubyConfig -import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.unarchiver.RubyIndexUnarchiver import doobie.util.transactor.Transactor @@ -22,7 +22,7 @@ object GemIndexDownloader { config, downloader, RubyIndexUnarchiver(config), - PackageIndexRep(xa), + PackageIndexDbRepository(xa), logger ) } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala index 38806c2..11be5e1 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/HackageIndexDownloader.scala @@ -3,7 +3,7 @@ package codesearch.core.meta.downloader import 
cats.effect.{ConcurrentEffect, ContextShift} import codesearch.core.config.HaskellConfig import cats.syntax.functor._ -import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.unarchiver.HaskellIndexUnarchiver import codesearch.core.util.Unarchiver @@ -24,7 +24,7 @@ object HackageIndexDownloader { config, downloader, HaskellIndexUnarchiver(unarchiver, config), - PackageIndexRep(xa), + PackageIndexDbRepository(xa), logger ) } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/NpmIndexDownloader.scala similarity index 85% rename from core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala rename to core/src/main/scala/codesearch/core/meta/downloader/NpmIndexDownloader.scala index a7119b5..172fd13 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/NpmMetaDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/NpmIndexDownloader.scala @@ -3,13 +3,13 @@ package codesearch.core.meta.downloader import cats.effect.{ContextShift, Sync} import cats.syntax.functor._ import codesearch.core.config.JavaScriptConfig -import codesearch.core.db.repository.PackageIndexRep +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.parser.JavaScriptIndexParser import doobie.util.transactor.Transactor import io.chrisdavenport.log4cats.slf4j.Slf4jLogger -object NpmMetaDownloader { +object NpmIndexDownloader { def apply[F[_]: Sync: ContextShift]( config: JavaScriptConfig, downloader: Downloader[F], @@ -21,7 +21,7 @@ object NpmMetaDownloader { new ByteStreamIndexDownloader( config, downloader, - PackageIndexRep(xa), + PackageIndexDbRepository(xa), JavaScriptIndexParser(config), logger ) diff --git 
a/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala b/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala index 077bc93..7cb1966 100644 --- a/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala +++ b/core/src/main/scala/codesearch/core/meta/parser/IndexByteStreamParser.scala @@ -1,8 +1,8 @@ package codesearch.core.meta.parser -import codesearch.core.db.repository.PackageIndex +import codesearch.core.db.repository.PackageIndexTableRow import fs2.Stream trait IndexByteStreamParser[F[_]] { - def parse(stream: Stream[F, Byte]): F[Stream[F, PackageIndex]] + def parse(stream: Stream[F, Byte]): F[Stream[F, PackageIndexTableRow]] } diff --git a/core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala b/core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala index 18ac91d..223da08 100644 --- a/core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala +++ b/core/src/main/scala/codesearch/core/meta/parser/JavaScriptIndexParser.scala @@ -2,7 +2,7 @@ package codesearch.core.meta.parser import cats.effect.Sync import codesearch.core.config.JavaScriptConfig -import codesearch.core.db.repository.PackageIndex +import codesearch.core.db.repository.PackageIndexTableRow import fs2.{Pipe, Stream} import fs2json.{JsonToken, TokenFilter, prettyPrinter, tokenParser} import io.circe.fs2.byteArrayParser @@ -10,16 +10,16 @@ import io.circe.{Decoder, Json} final class JavaScriptIndexParser[F[_]: Sync](config: JavaScriptConfig) extends IndexByteStreamParser[F] { - private implicit val docDecoder: Decoder[PackageIndex] = { cursor => + private implicit val docDecoder: Decoder[PackageIndexTableRow] = { cursor => val doc = cursor.downField("doc") for { name <- doc.get[String]("name") distTag = doc.downField("dist-tags") tag <- distTag.get[String]("latest") - } yield PackageIndex(name, tag, config.repository) + } yield PackageIndexTableRow(name, tag, config.repository) 
} - def parse(stream: Stream[F, Byte]): F[Stream[F, PackageIndex]] = { + def parse(stream: Stream[F, Byte]): F[Stream[F, PackageIndexTableRow]] = { Sync[F].pure( stream .through(tokenParser[F]) @@ -65,7 +65,7 @@ final class JavaScriptIndexParser[F[_]: Sync](config: JavaScriptConfig) extends } } - private def decoder(implicit decode: Decoder[PackageIndex]): Pipe[F, Json, PackageIndex] = { input => + private def decoder(implicit decode: Decoder[PackageIndexTableRow]): Pipe[F, Json, PackageIndexTableRow] = { input => input.flatMap { json => decode(json.hcursor) match { case Left(_) => Stream.empty diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala index 01bc28b..5d66923 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala @@ -8,25 +8,25 @@ import cats.instances.list._ import cats.syntax.foldable._ import cats.syntax.functor._ import codesearch.core.config.HaskellConfig -import codesearch.core.db.repository.PackageIndex +import codesearch.core.db.repository.PackageIndexTableRow import codesearch.core.model.Version import codesearch.core.util.Unarchiver import fs2.{Chunk, Stream} import org.rauschig.jarchivelib.ArchiveFormat.TAR import org.rauschig.jarchivelib.CompressionType.GZIP -final class HaskellIndexUnarchiver[F[_]: Sync]( +private[meta] final class HaskellIndexUnarchiver[F[_]: Sync]( unarchiver: Unarchiver[F], config: HaskellConfig ) extends StreamIndexUnarchiver[F] { - def unarchive(path: Path): F[Stream[F, PackageIndex]] = { + def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] = { for { _ <- unarchiver.extract(path, config.repoPath, TAR, GZIP) } yield flatPackages } - private def flatPackages: F[Stream[F, PackageIndex]] = { + private def flatPackages: F[Stream[F, PackageIndexTableRow]] = { 
Sync[F].pure( Stream .evalUnChunk(Sync[F].delay(Chunk.array(config.repoPath.toFile.listFiles))) @@ -37,7 +37,7 @@ final class HaskellIndexUnarchiver[F[_]: Sync]( .filter(_.isDirectory) .map(_.getName) .maximumOption(Order.fromLessThan(Version.less)) - .map(version => PackageIndex(packageDir.getName, version, config.repository)) + .map(version => PackageIndexTableRow(packageDir.getName, version, config.repository)) } } .unNone diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala index 076d7c6..a667668 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala @@ -6,16 +6,16 @@ import cats.effect.{ContextShift, Sync} import cats.syntax.functor._ import codesearch.core.BlockingEC import codesearch.core.config.RubyConfig -import codesearch.core.db.repository.PackageIndex +import codesearch.core.db.repository.PackageIndexTableRow import fs2.Stream import fs2.io.file import io.circe.fs2.{byteArrayParser, decoder} import scala.sys.process._ -final class RubyIndexUnarchiver[F[_]: Sync: ContextShift](config: RubyConfig) extends StreamIndexUnarchiver[F] { +private[meta] final class RubyIndexUnarchiver[F[_]: Sync: ContextShift](config: RubyConfig) extends StreamIndexUnarchiver[F] { - def unarchive(path: Path): F[Stream[F, PackageIndex]] = { + def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] = { for { _ <- Sync[F].delay { Seq( @@ -28,17 +28,17 @@ final class RubyIndexUnarchiver[F[_]: Sync: ContextShift](config: RubyConfig) ex } yield flatPackages } - private def flatPackages: F[Stream[F, PackageIndex]] = { + private def flatPackages: F[Stream[F, PackageIndexTableRow]] = { Sync[F].delay( file .readAll[F](config.repoJsonPath, BlockingEC, 4096) .through(byteArrayParser) .through(decoder[F, Seq[String]]) - .collect { case Seq(name, 
version, _) => PackageIndex(name, version, config.repository) }) + .collect { case Seq(name, version, _) => PackageIndexTableRow(name, version, config.repository) }) } } -object RubyIndexUnarchiver { +private[meta] object RubyIndexUnarchiver { def apply[F[_]: Sync: ContextShift]( config: RubyConfig ): RubyIndexUnarchiver[F] = new RubyIndexUnarchiver(config) diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala index f3490cb..2b73c5f 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala @@ -5,43 +5,43 @@ import java.nio.file.Path import cats.effect.Sync import cats.syntax.functor._ import codesearch.core.config.RustConfig -import codesearch.core.db.repository.PackageIndex +import codesearch.core.db.repository.PackageIndexTableRow import codesearch.core.util.{FsUtils, Unarchiver} import fs2.Stream import io.circe.Decoder import io.circe.fs2._ import org.rauschig.jarchivelib.ArchiveFormat.ZIP -final class RustIndexUnarchiver[F[_]: Sync]( +private[meta] final class RustIndexUnarchiver[F[_]: Sync]( unarchiver: Unarchiver[F], config: RustConfig ) extends StreamIndexUnarchiver[F] { - private implicit val packageDecoder: Decoder[PackageIndex] = { cursor => + private implicit val packageDecoder: Decoder[PackageIndexTableRow] = { cursor => for { name <- cursor.get[String]("name") version <- cursor.get[String]("vers") - } yield PackageIndex(name, version, config.repository) + } yield PackageIndexTableRow(name, version, config.repository) } - def unarchive(path: Path): F[Stream[F, PackageIndex]] = { + def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] = { for { _ <- unarchiver.extract(path, config.repoPath, ZIP) } yield flatPackages } - private def flatPackages: F[Stream[F, PackageIndex]] = { + private def flatPackages: F[Stream[F, 
PackageIndexTableRow]] = { Sync[F].delay( FsUtils .recursiveListFiles(config.repoPath.toFile) .filter(file => !config.ignoreFiles.contains(file.getName)) .evalMap(file => FsUtils.readFileAsync(file.getAbsolutePath).map(_.last)) .through(stringStreamParser) - .through(decoder[F, PackageIndex])) + .through(decoder[F, PackageIndexTableRow])) } } -object RustIndexUnarchiver { +private[meta] object RustIndexUnarchiver { def apply[F[_]: Sync]( unarchiver: Unarchiver[F], config: RustConfig diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala index 16ccd5e..bd2b358 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala @@ -2,9 +2,9 @@ package codesearch.core.meta.unarchiver import java.nio.file.Path -import codesearch.core.db.repository.PackageIndex +import codesearch.core.db.repository.PackageIndexTableRow import fs2.Stream -trait StreamIndexUnarchiver[F[_]] { - def unarchive(path: Path): F[Stream[F, PackageIndex]] +private[meta] trait StreamIndexUnarchiver[F[_]] { + def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] } diff --git a/core/src/main/scala/codesearch/core/sources/HaskellPackageSourcesUpdater.scala b/core/src/main/scala/codesearch/core/sources/HaskellPackageSourcesUpdater.scala new file mode 100644 index 0000000..fee5495 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/HaskellPackageSourcesUpdater.scala @@ -0,0 +1,5 @@ +package codesearch.core.sources + +final class HaskellPackageSourcesUpdater[F[_]] extends SourcesUpdater[F] { + def update: F[Unit] = +} diff --git a/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala b/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala new file mode 100644 index 0000000..09a34f6 --- /dev/null +++ 
b/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala @@ -0,0 +1,28 @@ +package codesearch.core.sources + +import cats.effect.Sync +import codesearch.core.config.RepositoryConfig +import cats.syntax.applicative._ +import cats.syntax.functor._ +import codesearch.core.db.repository.{Package, PackageIndexDbRepository} +import codesearch.core.sources.downloader.SourcesDownloader + +trait SourcesUpdater[F[_]] { + def update: F[Unit] +} + +class PackageSourcesUpdater[F[_]: Sync]( + config: RepositoryConfig, + indexRep: PackageIndexDbRepository[F], + downloader: SourcesDownloader[F, A] +) extends SourcesUpdater[F] { + + def update: F[Unit] = { + for { + newIndexes <- indexRep.findNew(config.repository).pure[F] + + } yield () + } + + private def download(`package`: Package): F[Unit] = {} +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala new file mode 100644 index 0000000..5844cff --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala @@ -0,0 +1,19 @@ +package codesearch.core.sources.downloader + +import codesearch.core.index.repository.Downloader +import cats.syntax.functor._ +import codesearch.core.config.HaskellConfig +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object HaskellSourceDownloader { + def apply[F[_]]( + downloader: Downloader[F], + config: HaskellConfig + ): SourcesDownloader[F] = + for { + logger <- Slf4jLogger.create + } yield SourcesDownloader( + downloader, + + ) +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala new file mode 100644 index 0000000..b4ec7f8 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala @@ -0,0 +1,38 @@ +package codesearch.core.sources.downloader + 
+import java.nio.file.Paths + +import cats.effect.Sync +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import codesearch.core.config.RepositoryConfig +import codesearch.core.db.repository.{PackageDbRepository, PackageIndexTableRow, PackageTableRow} +import codesearch.core.index.repository.Downloader +import codesearch.core.sources.filter.FileFilter +import codesearch.core.sources.unarchiver.SourcesUnarchiver +import com.softwaremill.sttp._ +import io.chrisdavenport.log4cats.Logger + +trait SourcesDownloader[F[_]] { + def downloadSources(packageIndex: PackageIndexTableRow): F[Unit] +} + +object SourcesDownloader { + def apply[F[_]: Sync]( + downloader: Downloader[F], + unarchiver: SourcesUnarchiver[F], + fileFilter: FileFilter[F], + packageDbRepository: PackageDbRepository[F], + config: RepositoryConfig, + logger: Logger[F] + ): SourcesDownloader[F] = (index: PackageIndexTableRow) => { + val packageUrl = uri"${config.packageUrl.format(index.name, index.version)}" + for { + _ <- logger.info(s"Downloading ${index.name}-${index.version} sources") + archive <- downloader.download(packageUrl, Paths.get("")) + sourcesDir <- unarchiver.unarchive(archive) + _ <- fileFilter.filter(sourcesDir) + _ <- packageDbRepository.upsert(PackageTableRow(index.name, index.version, index.repository)) + } yield () + } +} diff --git a/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala b/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala new file mode 100644 index 0000000..e510933 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala @@ -0,0 +1,45 @@ +package codesearch.core.sources.filter + +import java.io.File +import java.nio.file.Path + +import cats.effect.Sync +import cats.instances.list._ +import cats.syntax.applicative._ +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import cats.syntax.traverse._ +import codesearch.core.index.repository.Extensions +import 
org.apache.commons.io.FilenameUtils.getExtension + +trait FileFilter[F[_]] { + def filter(dir: Path): F[Int] +} + +object FileFilter { + def apply[F[_]: Sync]( + extensions: Extensions, + allowedFileNames: Set[String] + ): FileFilter[F] = new FileFilter[F] { + + private val maxFileSize: Int = 1024 * 1024 + + def filter(dir: Path): F[Int] = Sync[F].delay(deleteRecursively(dir.toFile, filter)) + + private def deleteRecursively(dir: File, predicate: File => Boolean): F[Int] = { + for { + (dirs, files) <- Sync[F].delay(dir.listFiles.toList.partition(_.isDirectory)) + filesDeleted <- files.filterNot(predicate).traverse(file => Sync[F].delay(file.delete)).map(_.size) + nestedFilesDeleted <- dirs.traverse(dir => deleteRecursively(dir, predicate)).map(_.size) + _ <- Sync[F].delay(dir.delete).whenA(dir.listFiles.isEmpty) + } yield filesDeleted + nestedFilesDeleted + } + + private def filter(file: File): Boolean = { + val fileName = file.getName.toLowerCase + val fileExt = getExtension(fileName) + (if (fileExt.isEmpty) allowedFileNames.contains(fileName) + else extensions.extensions.contains(fileExt)) && file.length < maxFileSize + } + } +} diff --git a/core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala b/core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala new file mode 100644 index 0000000..83f4bb9 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala @@ -0,0 +1,30 @@ +package codesearch.core.sources.filter + +import cats.effect.Sync +import codesearch.core.config.{HaskellConfig, JavaScriptConfig, RubyConfig, RustConfig} +import codesearch.core.index.repository.Extensions.{ + HaskellExtensions, + JavaScriptExtensions, + RubyExtensions, + RustExtensions +} + +object HaskellFileFilter { + def apply[F[_]: Sync](config: HaskellConfig): FileFilter[F] = + FileFilter[F](HaskellExtensions, Set[String]()) +} + +object JavaScriptFileFilter { + def apply[F[_]: Sync](config: JavaScriptConfig): FileFilter[F] = + 
FileFilter(JavaScriptExtensions, Set[String]()) +} + +object RubyFileFilter { + def apply[F[_]: Sync](config: RubyConfig): FileFilter[F] = + FileFilter(RubyExtensions, Set[String]()) +} + +object RustFileFilter { + def apply[F[_]: Sync](config: RustConfig): FileFilter[F] = + FileFilter(RustExtensions, Set[String]()) +} diff --git a/core/src/main/scala/codesearch/core/sources/unarchiver/RubySourcesUnarchiver.scala b/core/src/main/scala/codesearch/core/sources/unarchiver/RubySourcesUnarchiver.scala new file mode 100644 index 0000000..30f806c --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/unarchiver/RubySourcesUnarchiver.scala @@ -0,0 +1,22 @@ +package codesearch.core.sources.unarchiver + +import java.nio.file.Path + +import cats.effect.Sync +import org.rauschig.jarchivelib.ArchiveFormat.TAR +import org.rauschig.jarchivelib.ArchiverFactory +import org.rauschig.jarchivelib.CompressionType.GZIP + +object RubySourcesUnarchiver { + def apply[F[_]: Sync]: SourcesUnarchiver[F] = + (archive: Path, directory: Path) => + Sync[F].delay { + val destDir = directory.toFile + val allowedSet = Set("tgz", "tar.gz") + ArchiverFactory.createArchiver(TAR).extract(archive.toFile, destDir) + destDir.listFiles + .filter(file => allowedSet.exists(file.getName.toLowerCase.endsWith)) + .foreach(file => ArchiverFactory.createArchiver(TAR, GZIP).extract(file, destDir)) + directory + } +} diff --git a/core/src/main/scala/codesearch/core/sources/unarchiver/SourcesUnarchiver.scala b/core/src/main/scala/codesearch/core/sources/unarchiver/SourcesUnarchiver.scala new file mode 100644 index 0000000..852d2d6 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/unarchiver/SourcesUnarchiver.scala @@ -0,0 +1,44 @@ +package codesearch.core.sources.unarchiver + +import java.nio.file.Path + +import cats.effect.Sync +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import org.apache.commons.io.FileUtils.{moveDirectoryToDirectory, moveFileToDirectory} +import 
org.rauschig.jarchivelib.ArchiveFormat.TAR +import org.rauschig.jarchivelib.ArchiverFactory +import org.rauschig.jarchivelib.CompressionType.GZIP + +trait SourcesUnarchiver[F[_]] { + + /** Return directory containing all unarchived files and directories + * + * @param archive is file to unarchiving + * @param directory is target directory + * @return directory containing all unarchived files and directories + */ + def unarchive(archive: Path, directory: Path): F[Path] +} + +object SourcesUnarchiver { + def apply[F[_]: Sync]: SourcesUnarchiver[F] = new SourcesUnarchiver[F]() { + def unarchive(archive: Path, directory: Path): F[Path] = { + for { + _ <- Sync[F].delay(ArchiverFactory.createArchiver(TAR, GZIP).extract(archive.toFile, directory.toFile)) + path <- flatDir(directory) + } yield path + } + + private def flatDir(unarchived: Path): F[Path] = Sync[F].delay { + val dir = unarchived.toFile + val notCreateDestDir = false + dir.listFiles + .filter(_.isDirectory) + .foreach(_.listFiles.foreach(file => + if (file.isDirectory) moveDirectoryToDirectory(file, dir, notCreateDestDir) + else moveFileToDirectory(file, dir, notCreateDestDir))) + unarchived + } + } +} From 398191dadd574897dfb91ce87c0f667f02256dc1 Mon Sep 17 00:00:00 2001 From: kamilongus Date: Thu, 2 May 2019 15:54:37 +0500 Subject: [PATCH 5/7] + some changes Issue: https://github.com/aelve/codesearch/issues/250 --- .../scala/codesearch/core/config/Config.scala | 36 +++++++++----- .../db/repository/PackageDbRepository.scala | 6 +-- .../repository/PackageIndexDbRepository.scala | 10 ++-- .../downloader/ArchivedIndexDownloader.scala | 16 +++---- .../ByteStreamIndexDownloader.scala | 6 +-- .../unarchiver/HaskellIndexUnarchiver.scala | 2 +- .../meta/unarchiver/RubyIndexUnarchiver.scala | 6 ++- .../meta/unarchiver/RustIndexUnarchiver.scala | 2 +- .../unarchiver/StreamIndexUnarchiver.scala | 2 +- .../core/sources/PackageSourcesUpdater.scala | 19 ++++---- .../downloader/HaskellSourceDownloader.scala | 19 -------- 
.../downloader/HaskellSourcesDownloader.scala | 30 ++++++++++++ .../JavaScriptSourcesDownloader.scala | 30 ++++++++++++ .../RateLimitedSourcesDownloader.scala | 48 +++++++++++++++++++ .../downloader/RubySourcesDownloader.scala | 30 ++++++++++++ .../downloader/RustSourcesDownloader.scala | 30 ++++++++++++ .../downloader/SourcesDownloader.scala | 18 +++---- .../core/sources/filter/FileFilter.scala | 6 +-- .../core/sources/filter/FileFilters.scala | 30 ------------ .../sources/ratelimiter/RateLimiter.scala | 9 ++++ project/Builder.scala | 1 + 21 files changed, 248 insertions(+), 108 deletions(-) delete mode 100644 core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/HaskellSourcesDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/JavaScriptSourcesDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/RubySourcesDownloader.scala create mode 100644 core/src/main/scala/codesearch/core/sources/downloader/RustSourcesDownloader.scala delete mode 100644 core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala create mode 100644 core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala diff --git a/core/src/main/scala/codesearch/core/config/Config.scala b/core/src/main/scala/codesearch/core/config/Config.scala index 085de2a..9ad9114 100644 --- a/core/src/main/scala/codesearch/core/config/Config.scala +++ b/core/src/main/scala/codesearch/core/config/Config.scala @@ -10,7 +10,6 @@ import pureconfig.{CamelCase, ConfigFieldMapping, ProductHint} trait RepositoryConfig { def repository: String def repoIndexUrl: URI - def packageUrl: String } trait ArchivedIndexConfig extends RepositoryConfig { @@ -54,44 +53,55 @@ case class LanguagesConfig( case class HaskellConfig( 
repository: String, repoIndexUrl: URI, - packageUrl: String, repoArchivePath: Path, repoPath: Path, - concurrentTasksCount: Int + downloaderConfig: PackageDownloaderConfig ) extends ArchivedIndexConfig case class RubyConfig( repository: String, repoIndexUrl: URI, - packageUrl: String, repoArchivePath: Path, repoJsonPath: Path, scriptPath: Path, - concurrentTasksCount: Int + downloaderConfig: PackageDownloaderConfig ) extends ArchivedIndexConfig case class RustConfig( repository: String, repoIndexUrl: URI, - packageUrl: String, repoArchivePath: Path, repoPath: Path, - concurrentTasksCount: Int, - ignoreFiles: Set[String] + ignoreFiles: Set[String], + downloaderConfig: PackageDownloaderConfig ) extends ArchivedIndexConfig case class JavaScriptConfig( repository: String, repoIndexUrl: URI, - packageUrl: String, - concurrentTasksCount: Int + downloaderConfig: PackageDownloaderConfig ) extends RepositoryConfig -case class SourceFilesExtensions( - commonExtensions: Set[String], - sourcesExtensions: Set[String] +case class SourcesFilterConfig( + allowedFileNames: Set[String] +) + +case class IndexDownloaderConfig( + + ) + +case class PackageDownloaderConfig( + packageUrl: String, + packageArchivePath: String, + packageSourcesPath: String, + filterConfig: SourcesFilterConfig ) +case class RateLimitConfig( + numberTasks: Int, + seconds + ) + case class MetricsConfig( enableMatomoMetrics: Boolean ) diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala b/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala index 967ccc8..04d1da7 100644 --- a/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala +++ b/core/src/main/scala/codesearch/core/db/repository/PackageDbRepository.scala @@ -17,16 +17,16 @@ final case class PackageTableRow( ) trait PackageDbRepository[F[_]] { - def upsert(`package`: PackageTableRow): F[Int] + def upsert(name: String, version: String, repository: String): F[Int] def 
findByRepository(repository: String): Stream[F, Package] } object PackageDbRepository { def apply[F[_]: Monad](xa: Transactor[F]): PackageDbRepository[F] = new PackageDbRepository[F] { - def upsert(`package`: PackageTableRow): F[Int] = { + def upsert(name: String, version: String, repository: String): F[Int] = { sql""" INSERT INTO package(name, version, repository, updated_at) - VALUES (${`package`.name}, ${`package`.version}, ${`package`.repository}, now()) + VALUES ($name, $version, $repository, now()) ON CONFLICT (name, repository) DO UPDATE SET version = excluded.version, updated_at = excluded.updated_at diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala b/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala index 625f57b..6a5f6f5 100644 --- a/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala +++ b/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala @@ -17,14 +17,14 @@ final case class PackageIndex( version: String ) -trait PackageIndexDbRepository[F[_]] { +trait PackageIndexRepository[F[_]] { def batchUpsert(packages: List[PackageIndexTableRow]): F[Int] def batchUpsert(stream: Stream[F, PackageIndexTableRow]): F[Int] - def findNew(repository: String): Stream[F, PackageIndex] + def findNewByRepository(repository: String): Stream[F, PackageIndexTableRow] } object PackageIndexDbRepository { - def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexDbRepository[F] = new PackageIndexDbRepository[F] { + def apply[F[_]: Monad](xa: Transactor[F]): PackageIndexRepository[F] = new PackageIndexRepository[F] { def batchUpsert(packages: List[PackageIndexTableRow]): F[Int] = { Update[PackageIndexTableRow]( @@ -46,13 +46,13 @@ object PackageIndexDbRepository { .drain } - def findNew(repository: String): Stream[F, PackageIndexTableRow] = { + def findNewByRepository(repository: String): Stream[F, PackageIndexTableRow] = { sql""" SELECT r.name, 
r.version, r.repository FROM repository_index r LEFT JOIN package p ON r.name <> p.name AND r.version <> p.version - """.query[PackageIndex].stream.transact(xa) + """.query[PackageIndexTableRow].stream.transact(xa) } } } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala index 7311d29..d0e4a74 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/ArchivedIndexDownloader.scala @@ -4,7 +4,7 @@ import cats.effect.{ContextShift, Sync} import cats.syntax.flatMap._ import cats.syntax.functor._ import codesearch.core.config.ArchivedIndexConfig -import codesearch.core.db.repository.PackageIndexDbRepository +import codesearch.core.db.repository.PackageIndexRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.unarchiver.StreamIndexUnarchiver import com.softwaremill.sttp.Uri @@ -15,17 +15,17 @@ private[meta] class ArchivedIndexDownloader[F[_]: Sync: ContextShift]( config: ArchivedIndexConfig, downloader: Downloader[F], unarchiver: StreamIndexUnarchiver[F], - indexRep: PackageIndexDbRepository[F], + indexRepository: PackageIndexRepository[F], logger: Logger[F] ) extends RepositoryIndexDownloader[F] { def download: F[Unit] = for { - _ <- logger.info(s"Downloading ${config.repository} meta information") - path <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) - stream <- unarchiver.unarchive(path) - _ <- indexRep.batchUpsert(stream) - _ <- Sync[F].delay(FileUtils.cleanDirectory(config.repoArchivePath.getParent.toFile)) - _ <- logger.info("Downloading finished") + _ <- logger.info(s"Downloading ${config.repository} meta information") + archive <- downloader.download(Uri(config.repoIndexUrl), config.repoArchivePath) + stream <- unarchiver.unarchiveToStream(archive) + _ <- 
indexRepository.batchUpsert(stream) + _ <- Sync[F].delay(FileUtils.cleanDirectory(config.repoArchivePath.getParent.toFile)) + _ <- logger.info("Downloading finished") } yield () } diff --git a/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala b/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala index 1744309..33dd388 100644 --- a/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala +++ b/core/src/main/scala/codesearch/core/meta/downloader/ByteStreamIndexDownloader.scala @@ -5,7 +5,7 @@ import cats.syntax.applicative._ import cats.syntax.flatMap._ import cats.syntax.functor._ import codesearch.core.config.RepositoryConfig -import codesearch.core.db.repository.PackageIndexDbRepository +import codesearch.core.db.repository.PackageIndexRepository import codesearch.core.index.repository.Downloader import codesearch.core.meta.parser.IndexByteStreamParser import com.softwaremill.sttp.Uri @@ -14,7 +14,7 @@ import io.chrisdavenport.log4cats.Logger private[meta] class ByteStreamIndexDownloader[F[_]: Sync: ContextShift]( config: RepositoryConfig, downloader: Downloader[F], - indexRep: PackageIndexDbRepository[F], + indexDbRepository: PackageIndexRepository[F], indexParser: IndexByteStreamParser[F], logger: Logger[F] ) extends RepositoryIndexDownloader[F] { @@ -24,7 +24,7 @@ private[meta] class ByteStreamIndexDownloader[F[_]: Sync: ContextShift]( _ <- logger.info(s"Downloading ${config.repository} meta information") stream <- downloader.download(Uri(config.repoIndexUrl)).pure[F].widen index <- indexParser.parse(stream) - _ <- indexRep.batchUpsert(index) + _ <- indexDbRepository.batchUpsert(index) _ <- logger.info("Downloading finished") } yield () } diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala index 5d66923..df0ad85 100644 --- 
a/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/HaskellIndexUnarchiver.scala @@ -20,7 +20,7 @@ private[meta] final class HaskellIndexUnarchiver[F[_]: Sync]( config: HaskellConfig ) extends StreamIndexUnarchiver[F] { - def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] = { + def unarchiveToStream(path: Path): F[Stream[F, PackageIndexTableRow]] = { for { _ <- unarchiver.extract(path, config.repoPath, TAR, GZIP) } yield flatPackages diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala index a667668..2c9baa0 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RubyIndexUnarchiver.scala @@ -13,9 +13,11 @@ import io.circe.fs2.{byteArrayParser, decoder} import scala.sys.process._ -private[meta] final class RubyIndexUnarchiver[F[_]: Sync: ContextShift](config: RubyConfig) extends StreamIndexUnarchiver[F] { +private[meta] final class RubyIndexUnarchiver[F[_]: Sync: ContextShift]( + config: RubyConfig +) extends StreamIndexUnarchiver[F] { - def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] = { + def unarchiveToStream(path: Path): F[Stream[F, PackageIndexTableRow]] = { for { _ <- Sync[F].delay { Seq( diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala index 2b73c5f..9f29090 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/RustIndexUnarchiver.scala @@ -24,7 +24,7 @@ private[meta] final class RustIndexUnarchiver[F[_]: Sync]( } yield PackageIndexTableRow(name, version, config.repository) } - def unarchive(path: Path): F[Stream[F, 
PackageIndexTableRow]] = { + def unarchiveToStream(path: Path): F[Stream[F, PackageIndexTableRow]] = { for { _ <- unarchiver.extract(path, config.repoPath, ZIP) } yield flatPackages diff --git a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala index bd2b358..319606b 100644 --- a/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala +++ b/core/src/main/scala/codesearch/core/meta/unarchiver/StreamIndexUnarchiver.scala @@ -6,5 +6,5 @@ import codesearch.core.db.repository.PackageIndexTableRow import fs2.Stream private[meta] trait StreamIndexUnarchiver[F[_]] { - def unarchive(path: Path): F[Stream[F, PackageIndexTableRow]] + def unarchiveToStream(path: Path): F[Stream[F, PackageIndexTableRow]] } diff --git a/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala b/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala index 09a34f6..bcb0ef0 100644 --- a/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala +++ b/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala @@ -2,9 +2,7 @@ package codesearch.core.sources import cats.effect.Sync import codesearch.core.config.RepositoryConfig -import cats.syntax.applicative._ -import cats.syntax.functor._ -import codesearch.core.db.repository.{Package, PackageIndexDbRepository} +import codesearch.core.db.repository.PackageIndexDbRepository import codesearch.core.sources.downloader.SourcesDownloader trait SourcesUpdater[F[_]] { @@ -13,16 +11,15 @@ trait SourcesUpdater[F[_]] { class PackageSourcesUpdater[F[_]: Sync]( config: RepositoryConfig, - indexRep: PackageIndexDbRepository[F], - downloader: SourcesDownloader[F, A] + indexDbRepository: PackageIndexDbRepository[F], + downloader: SourcesDownloader[F] ) extends SourcesUpdater[F] { def update: F[Unit] = { - for { - newIndexes <- indexRep.findNew(config.repository).pure[F] - - 
} yield () + indexDbRepository + .findNewByRepository(config.repository) + .map(downloader.download) + .compile + .drain } - - private def download(`package`: Package): F[Unit] = {} } diff --git a/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala deleted file mode 100644 index 5844cff..0000000 --- a/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourceDownloader.scala +++ /dev/null @@ -1,19 +0,0 @@ -package codesearch.core.sources.downloader - -import codesearch.core.index.repository.Downloader -import cats.syntax.functor._ -import codesearch.core.config.HaskellConfig -import io.chrisdavenport.log4cats.slf4j.Slf4jLogger - -object HaskellSourceDownloader { - def apply[F[_]]( - downloader: Downloader[F], - config: HaskellConfig - ): SourcesDownloader[F] = - for { - logger <- Slf4jLogger.create - } yield SourcesDownloader( - downloader, - - ) -} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourcesDownloader.scala new file mode 100644 index 0000000..2b1f108 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/downloader/HaskellSourcesDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.sources.downloader + +import cats.effect.Sync +import cats.syntax.functor._ +import codesearch.core.config.PackageDownloaderConfig +import codesearch.core.db.repository.PackageDbRepository +import codesearch.core.index.repository.Downloader +import codesearch.core.index.repository.Extensions.HaskellExtensions +import codesearch.core.sources.filter.FileFilter +import codesearch.core.sources.unarchiver.SourcesUnarchiver +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object HaskellSourcesDownloader { + def apply[F[_]: Sync]( + downloader: Downloader[F], + packageDbRepository: PackageDbRepository[F], + 
downloaderConfig: PackageDownloaderConfig + ): F[SourcesDownloader[F]] = + for { + logger <- Slf4jLogger.create + } yield + SourcesDownloader( + downloader, + SourcesUnarchiver[F], + FileFilter[F](HaskellExtensions, downloaderConfig.filterConfig.allowedFileNames), + packageDbRepository, + downloaderConfig, + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/JavaScriptSourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/JavaScriptSourcesDownloader.scala new file mode 100644 index 0000000..bedfb3f --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/downloader/JavaScriptSourcesDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.sources.downloader + +import cats.effect.Sync +import cats.syntax.functor._ +import codesearch.core.config.PackageDownloaderConfig +import codesearch.core.db.repository.PackageDbRepository +import codesearch.core.index.repository.Downloader +import codesearch.core.index.repository.Extensions.JavaScriptExtensions +import codesearch.core.sources.filter.FileFilter +import codesearch.core.sources.unarchiver.SourcesUnarchiver +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object JavaScriptSourcesDownloader { + def apply[F[_]: Sync]( + downloader: Downloader[F], + packageDbRepository: PackageDbRepository[F], + downloaderConfig: PackageDownloaderConfig + ): F[SourcesDownloader[F]] = + for { + logger <- Slf4jLogger.create + } yield + SourcesDownloader( + downloader, + SourcesUnarchiver[F], + FileFilter[F](JavaScriptExtensions, downloaderConfig.filterConfig.allowedFileNames), + packageDbRepository, + downloaderConfig, + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala new file mode 100644 index 0000000..bc2aa90 --- /dev/null +++ 
b/core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala @@ -0,0 +1,48 @@ +package codesearch.core.sources.downloader + +import java.nio.file.Paths + +import cats.effect.{Concurrent, Sync, Timer} +import codesearch.core.config.PackageDownloaderConfig +import codesearch.core.db.repository.{PackageDbRepository, PackageIndexTableRow} +import codesearch.core.index.repository.Downloader +import codesearch.core.sources.filter.FileFilter +import codesearch.core.sources.unarchiver.SourcesUnarchiver +import com.softwaremill.sttp.Uri +import io.chrisdavenport.log4cats.Logger +import upperbound.{Limiter, Rate} +import cats.effect.concurrent.Deferred +import upperbound.syntax.rate._ + + +import scala.concurrent.duration._ + +object RateLimitedSourcesDownloader { + def apply[F[_]: Concurrent: Timer]( + downloader: Downloader[F], + unarchiver: SourcesUnarchiver[F], + fileFilter: FileFilter[F], + packageDbRepository: PackageDbRepository[F], + config: PackageDownloaderConfig, + logger: Logger[F] + ): SourcesDownloader[F] = new SourcesDownloader[F] { + + def download(index: PackageIndexTableRow): F[Unit] = { + val packageUrl = Uri(config.packageUrl.format(index.name, index.version)) + val archivePath = Paths.get(config.packageArchivePath.format(index.name, index.version)) + val sourcesPath = Paths.get(config.packageSourcesPath.format(index.name, index.version)) + for { + _ <- logger.info(s"Downloading ${index.name}-${index.version} sources") + archive <- downloader.download(packageUrl, archivePath) + sourcesDir <- unarchiver.unarchive(archive, sourcesPath) + _ <- fileFilter.filter(sourcesDir) + _ <- packageDbRepository.upsert(index.name, index.version, index.repository) + } yield () + } + + private def rateLimitedDownload: F[Unit] = { + Limiter.start[F](maxRate = 10 every 1.seconds).use { implicit limiter => + } + } + } +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/RubySourcesDownloader.scala 
b/core/src/main/scala/codesearch/core/sources/downloader/RubySourcesDownloader.scala new file mode 100644 index 0000000..87aac24 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/downloader/RubySourcesDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.sources.downloader + +import cats.effect.Sync +import cats.syntax.functor._ +import codesearch.core.config.PackageDownloaderConfig +import codesearch.core.db.repository.PackageDbRepository +import codesearch.core.index.repository.Downloader +import codesearch.core.index.repository.Extensions.RubyExtensions +import codesearch.core.sources.filter.FileFilter +import codesearch.core.sources.unarchiver.RubySourcesUnarchiver +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object RubySourcesDownloader { + def apply[F[_]: Sync]( + downloader: Downloader[F], + packageDbRepository: PackageDbRepository[F], + downloaderConfig: PackageDownloaderConfig + ): F[SourcesDownloader[F]] = + for { + logger <- Slf4jLogger.create + } yield + SourcesDownloader( + downloader, + RubySourcesUnarchiver[F], + FileFilter[F](RubyExtensions, downloaderConfig.filterConfig.allowedFileNames), + packageDbRepository, + downloaderConfig, + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/RustSourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/RustSourcesDownloader.scala new file mode 100644 index 0000000..e31b495 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/downloader/RustSourcesDownloader.scala @@ -0,0 +1,30 @@ +package codesearch.core.sources.downloader + +import cats.effect.Sync +import cats.syntax.functor._ +import codesearch.core.config.PackageDownloaderConfig +import codesearch.core.db.repository.PackageDbRepository +import codesearch.core.index.repository.Downloader +import codesearch.core.index.repository.Extensions.RustExtensions +import codesearch.core.sources.filter.FileFilter +import 
codesearch.core.sources.unarchiver.SourcesUnarchiver +import io.chrisdavenport.log4cats.slf4j.Slf4jLogger + +object RustSourcesDownloader { + def apply[F[_]: Sync]( + downloader: Downloader[F], + packageDbRepository: PackageDbRepository[F], + downloaderConfig: PackageDownloaderConfig + ): F[SourcesDownloader[F]] = + for { + logger <- Slf4jLogger.create + } yield + SourcesDownloader( + downloader, + SourcesUnarchiver[F], + FileFilter[F](RustExtensions, downloaderConfig.filterConfig.allowedFileNames), + packageDbRepository, + downloaderConfig, + logger + ) +} diff --git a/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala index b4ec7f8..4cc39d1 100644 --- a/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala +++ b/core/src/main/scala/codesearch/core/sources/downloader/SourcesDownloader.scala @@ -5,8 +5,8 @@ import java.nio.file.Paths import cats.effect.Sync import cats.syntax.flatMap._ import cats.syntax.functor._ -import codesearch.core.config.RepositoryConfig -import codesearch.core.db.repository.{PackageDbRepository, PackageIndexTableRow, PackageTableRow} +import codesearch.core.config.PackageDownloaderConfig +import codesearch.core.db.repository.{PackageDbRepository, PackageIndexTableRow} import codesearch.core.index.repository.Downloader import codesearch.core.sources.filter.FileFilter import codesearch.core.sources.unarchiver.SourcesUnarchiver @@ -14,7 +14,7 @@ import com.softwaremill.sttp._ import io.chrisdavenport.log4cats.Logger trait SourcesDownloader[F[_]] { - def downloadSources(packageIndex: PackageIndexTableRow): F[Unit] + def download(index: PackageIndexTableRow): F[Unit] } object SourcesDownloader { @@ -23,16 +23,18 @@ object SourcesDownloader { unarchiver: SourcesUnarchiver[F], fileFilter: FileFilter[F], packageDbRepository: PackageDbRepository[F], - config: RepositoryConfig, + config: PackageDownloaderConfig, 
logger: Logger[F] ): SourcesDownloader[F] = (index: PackageIndexTableRow) => { - val packageUrl = uri"${config.packageUrl.format(index.name, index.version)}" + val packageUrl = Uri(config.packageUrl.format(index.name, index.version)) + val archivePath = Paths.get(config.packageArchivePath.format(index.name, index.version)) + val sourcesPath = Paths.get(config.packageSourcesPath.format(index.name, index.version)) for { _ <- logger.info(s"Downloading ${index.name}-${index.version} sources") - archive <- downloader.download(packageUrl, Paths.get("")) - sourcesDir <- unarchiver.unarchive(archive) + archive <- downloader.download(packageUrl, archivePath) + sourcesDir <- unarchiver.unarchive(archive, sourcesPath) _ <- fileFilter.filter(sourcesDir) - _ <- packageDbRepository.upsert(PackageTableRow(index.name, index.version, index.repository)) + _ <- packageDbRepository.upsert(index.name, index.version, index.repository) } yield () } } diff --git a/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala b/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala index e510933..2bd03ad 100644 --- a/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala +++ b/core/src/main/scala/codesearch/core/sources/filter/FileFilter.scala @@ -24,13 +24,13 @@ object FileFilter { private val maxFileSize: Int = 1024 * 1024 - def filter(dir: Path): F[Int] = Sync[F].delay(deleteRecursively(dir.toFile, filter)) + def filter(dir: Path): F[Int] = Sync[F].delay(filterRecursively(dir.toFile, filter)) - private def deleteRecursively(dir: File, predicate: File => Boolean): F[Int] = { + private def filterRecursively(dir: File, predicate: File => Boolean): F[Int] = { for { (dirs, files) <- Sync[F].delay(dir.listFiles.toList.partition(_.isDirectory)) filesDeleted <- files.filterNot(predicate).traverse(file => Sync[F].delay(file.delete)).map(_.size) - nestedFilesDeleted <- dirs.traverse(dir => deleteRecursively(dir, predicate)).map(_.size) + nestedFilesDeleted <- 
dirs.traverse(dir => filterRecursively(dir, predicate)).map(_.size) _ <- Sync[F].delay(dir.delete).whenA(dir.listFiles.isEmpty) } yield filesDeleted + nestedFilesDeleted } diff --git a/core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala b/core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala deleted file mode 100644 index 83f4bb9..0000000 --- a/core/src/main/scala/codesearch/core/sources/filter/FileFilters.scala +++ /dev/null @@ -1,30 +0,0 @@ -package codesearch.core.sources.filter - -import cats.effect.Sync -import codesearch.core.config.{HaskellConfig, JavaScriptConfig, RubyConfig, RustConfig} -import codesearch.core.index.repository.Extensions.{ - HaskellExtensions, - JavaScriptExtensions, - RubyExtensions, - RustExtensions -} - -object HaskellFileFilter { - def apply[F[_]: Sync](config: HaskellConfig): FileFilter[F] = - FileFilter[F](HaskellExtensions, Set[String]()) -} - -object JavaScriptFileFilter { - def apply[F[_]: Sync](config: JavaScriptConfig): FileFilter[F] = - FileFilter(JavaScriptExtensions, Set[String]()) -} - -object RubyFileFilter { - def apply[F[_]: Sync](config: RubyConfig): FileFilter[F] = - FileFilter(RubyExtensions, Set[String]()) -} - -object RustFileFilter { - def apply[F[_]: Sync](config: RustConfig): FileFilter[F] = - FileFilter(RustExtensions, Set[String]()) -} diff --git a/core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala b/core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala new file mode 100644 index 0000000..359fc93 --- /dev/null +++ b/core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala @@ -0,0 +1,9 @@ +package codesearch.core.sources.ratelimiter + +import scala.concurrent.duration.FiniteDuration + +case class Rate(numberTasks: Int, duration: FiniteDuration) + +trait RateLimiter { + +} \ No newline at end of file diff --git a/project/Builder.scala b/project/Builder.scala index 6a7491c..0e3414c 100644 --- a/project/Builder.scala +++ 
b/project/Builder.scala @@ -84,6 +84,7 @@ object Builder { "org.tpolecat" %% "doobie-postgres" % "0.6.0", "org.tpolecat" %% "doobie-specs2" % "0.6.0", "org.flywaydb" % "flyway-core" % "5.2.4", + "org.systemfw" % "upperbound_2.12" % "0.2.0-M2", ), assemblyMergeStrategy in assembly := { case PathList("META-INF", _ @_*) => MergeStrategy.discard From a3733c00c4fb0d7070cb62749c18f9a80bd545df Mon Sep 17 00:00:00 2001 From: kamilongus Date: Wed, 15 May 2019 03:26:35 +0400 Subject: [PATCH 6/7] + added SearchProvider trait --- .../codesearch/core/config/CindexConfig.scala | 6 ++ .../scala/codesearch/core/config/Config.scala | 26 ++++---- .../core/config/SourcesFilesConfig.scala | 12 ++++ .../repository/PackageIndexDbRepository.scala | 9 ++- .../core/search/SearchRequest.scala | 7 ++- .../core/search/engine/CodeSearcher.scala | 25 ++++++++ .../core/search/engine/SearchProvider.scala | 5 ++ .../search/engine/StreamSnippetGrouper.scala | 47 ++++++++++++++ .../engine/csearch/CsearchProvider.scala | 63 +++++++++++++++++++ .../core/sources/PackageSourcesUpdater.scala | 42 +++++++++---- .../RateLimitedSourcesDownloader.scala | 48 -------------- .../sources/ratelimiter/RateLimiter.scala | 9 --- 12 files changed, 212 insertions(+), 87 deletions(-) create mode 100644 core/src/main/scala/codesearch/core/config/CindexConfig.scala create mode 100644 core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala create mode 100644 core/src/main/scala/codesearch/core/search/engine/CodeSearcher.scala create mode 100644 core/src/main/scala/codesearch/core/search/engine/SearchProvider.scala create mode 100644 core/src/main/scala/codesearch/core/search/engine/StreamSnippetGrouper.scala create mode 100644 core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala delete mode 100644 core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala delete mode 100644 core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala diff 
--git a/core/src/main/scala/codesearch/core/config/CindexConfig.scala b/core/src/main/scala/codesearch/core/config/CindexConfig.scala new file mode 100644 index 0000000..9d584e7 --- /dev/null +++ b/core/src/main/scala/codesearch/core/config/CindexConfig.scala @@ -0,0 +1,6 @@ +package codesearch.core.config + +case class CindexConfig( + indexDir: String, + tempIndexDir: String, +) diff --git a/core/src/main/scala/codesearch/core/config/Config.scala b/core/src/main/scala/codesearch/core/config/Config.scala index 9ad9114..5473778 100644 --- a/core/src/main/scala/codesearch/core/config/Config.scala +++ b/core/src/main/scala/codesearch/core/config/Config.scala @@ -82,25 +82,27 @@ case class JavaScriptConfig( downloaderConfig: PackageDownloaderConfig ) extends RepositoryConfig -case class SourcesFilterConfig( - allowedFileNames: Set[String] -) - -case class IndexDownloaderConfig( - - ) +case class SourcesUpdaterConfig() case class PackageDownloaderConfig( packageUrl: String, packageArchivePath: String, packageSourcesPath: String, - filterConfig: SourcesFilterConfig + filterConfig: SourcesFilterConfig, +) + +case class SourcesFilterConfig( + allowedFileNames: Set[String] ) -case class RateLimitConfig( - numberTasks: Int, - seconds - ) +case class SourcesExtraConfig( + testDirs: Set[String], +) + +case class RateLimiterConfig( + numberTasks: Int, + per: Int +) case class MetricsConfig( enableMatomoMetrics: Boolean diff --git a/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala b/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala new file mode 100644 index 0000000..b313410 --- /dev/null +++ b/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala @@ -0,0 +1,12 @@ +package codesearch.core.config + +case class SourcesFilesConfig( + testDirsNames: Set[String], + allowedFileNames: Set[String], + filesExtensions: FilesExtensions +) + +case class FilesExtensions( + commonExtensions: Set[String], + sourceExtensions: Set[String], +) { 
def extensions: Set[String] = commonExtensions ++ sourceExtensions } diff --git a/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala b/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala index 6a5f6f5..62d0071 100644 --- a/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala +++ b/core/src/main/scala/codesearch/core/db/repository/PackageIndexDbRepository.scala @@ -20,7 +20,7 @@ final case class PackageIndex( trait PackageIndexRepository[F[_]] { def batchUpsert(packages: List[PackageIndexTableRow]): F[Int] def batchUpsert(stream: Stream[F, PackageIndexTableRow]): F[Int] - def findNewByRepository(repository: String): Stream[F, PackageIndexTableRow] + def findLatestByRepository(repository: String): Stream[F, PackageIndexTableRow] } object PackageIndexDbRepository { @@ -37,16 +37,15 @@ object PackageIndexDbRepository { ).updateMany(packages).transact(xa) } - def batchUpsert(stream: Stream[F, PackageIndexTableRow]): F[Int] = { - val insertBatchSize = 10000 + def batchUpsert(stream: Stream[F, PackageIndexTableRow], batchSize: Int = 10000): F[Int] = { stream - .chunkN(insertBatchSize) + .chunkN(batchSize) .map(packages => batchUpsert(packages.toList)) .compile .drain } - def findNewByRepository(repository: String): Stream[F, PackageIndexTableRow] = { + def findLatestByRepository(repository: String): Stream[F, PackageIndexTableRow] = { sql""" SELECT r.name, r.version, r.repository FROM repository_index r diff --git a/core/src/main/scala/codesearch/core/search/SearchRequest.scala b/core/src/main/scala/codesearch/core/search/SearchRequest.scala index 82e3025..5669c3d 100644 --- a/core/src/main/scala/codesearch/core/search/SearchRequest.scala +++ b/core/src/main/scala/codesearch/core/search/SearchRequest.scala @@ -21,7 +21,8 @@ case class SearchRequest( spaceInsensitive: Boolean, preciseMatch: Boolean, sourcesOnly: Boolean, - page: Int + page: Int, + limit: Int ) { /** @@ -46,7 +47,8 @@ 
object SearchRequest { spaceInsensitive: String, preciseMatch: String, sourcesOnly: String, - page: String + page: String, + limit: String ): SearchRequest = { SearchRequest( lang, @@ -58,6 +60,7 @@ object SearchRequest { isEnabled(preciseMatch), isEnabled(sourcesOnly), page.toInt, + limit.toInt ) } diff --git a/core/src/main/scala/codesearch/core/search/engine/CodeSearcher.scala b/core/src/main/scala/codesearch/core/search/engine/CodeSearcher.scala new file mode 100644 index 0000000..99beb8c --- /dev/null +++ b/core/src/main/scala/codesearch/core/search/engine/CodeSearcher.scala @@ -0,0 +1,25 @@ +package codesearch.core.search.engine + +import cats.effect.Sync +import codesearch.core.search.SearchRequest +import codesearch.core.search.engine.csearch.MatchedRow +import cats.syntax.flatMap._ +import codesearch.core.config.SnippetConfig +import fs2.Stream + +sealed trait Response +case class ErrorResponse(message: String) extends Response +case class SuccessfulResponse[T](value: T) extends Response + +object CodeSearcher { + def apply[F[_]: Sync]( + csearchProvider: SearchProvider[F, SearchRequest, Stream[F, MatchedRow]], + snippetConfig: SnippetConfig + ): SearchProvider[F, SearchRequest, Response] = (request: SearchRequest) => { + val snippetGrouper = StreamSnippetGrouper(snippetConfig) + csearchProvider.searchBy(request).flatMap { rows => + val snippets = rows.through(snippetGrouper.group) + Sync[F].pure(SuccessfulResponse(snippets): Response) + } + } +} diff --git a/core/src/main/scala/codesearch/core/search/engine/SearchProvider.scala b/core/src/main/scala/codesearch/core/search/engine/SearchProvider.scala new file mode 100644 index 0000000..76f99ae --- /dev/null +++ b/core/src/main/scala/codesearch/core/search/engine/SearchProvider.scala @@ -0,0 +1,5 @@ +package codesearch.core.search.engine + +trait SearchProvider[F[_], QueryParam, Result] { + def searchBy(param: QueryParam): F[Result] +} \ No newline at end of file diff --git
a/core/src/main/scala/codesearch/core/search/engine/StreamSnippetGrouper.scala b/core/src/main/scala/codesearch/core/search/engine/StreamSnippetGrouper.scala new file mode 100644 index 0000000..8183c53 --- /dev/null +++ b/core/src/main/scala/codesearch/core/search/engine/StreamSnippetGrouper.scala @@ -0,0 +1,47 @@ +package codesearch.core.search.engine + +import cats.Applicative +import cats.data.NonEmptyVector +import codesearch.core.config.SnippetConfig +import codesearch.core.search.engine.csearch.MatchedRow +import fs2._ +import cats.instances.string._ + +/** + * Info about code snippet + * + * @param filePath absolute path to file + * @param lines numbers of matched lines in file + */ +case class SnippetInfo(filePath: String, lines: NonEmptyVector[Int]) + +final class StreamSnippetGrouper[F[_]: Applicative](config: SnippetConfig) { + + def group: Pipe[F, MatchedRow, SnippetInfo] = { matchedRows => + for { + (_, matchedRow) <- matchedRows.groupAdjacentBy(_.path) + snippets <- Stream.emits(groupToSnippets(matchedRow)) + } yield snippets + + def groupToSnippets(rows: Chunk[MatchedRow]): Seq[SnippetInfo] = { + rows.foldLeft(Vector.empty[SnippetInfo]) { (snippets, row) => + snippets.lastOption match { + case Some(snippet) => + if (row.lineNumber < snippet.lines.last + config.linesAfter) + snippets.init :+ snippet.copy(lines = snippet.lines :+ row.lineNumber) + else + snippets :+ SnippetInfo(row.path, NonEmptyVector.one(row.lineNumber)) + case None => + snippets :+ SnippetInfo(row.path, NonEmptyVector.one(row.lineNumber)) + } + } + } + } + +} + +object StreamSnippetGrouper { + def apply[F[_]: Applicative]( + config: SnippetConfig + ): StreamSnippetGrouper[F] = new StreamSnippetGrouper[F](config) +} diff --git a/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala b/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala new file mode 100644 index 0000000..350e017 --- /dev/null +++ 
b/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala @@ -0,0 +1,63 @@ +package codesearch.core.search.engine.csearch + +import cats.effect.Sync +import cats.syntax.flatMap._ +import cats.syntax.functor._ +import cats.syntax.applicative._ +import codesearch.core.index.directory.СindexDirectory +import codesearch.core.index.repository.Extensions +import codesearch.core.regex.RegexConstructor +import codesearch.core.search.SearchRequest +import codesearch.core.search.engine.SearchProvider +import fs2.{Pipe, Stream} +import io.chrisdavenport.log4cats.Logger + +import scala.sys.process.Process + +case class MatchedRow(path: String, lineNumber: Int) + +object CsearchProvider { + def apply[F[_]: Sync]( + cindex: СindexDirectory, + extensions: Extensions, + logger: Logger[F] + ): SearchProvider[F, SearchRequest, Stream[F, MatchedRow]] = (request: SearchRequest) => { + + val indexDir = cindex.indexDirAs[String] + val environment = ("CSEARCHINDEX", indexDir) + val pipe = Seq("head", s"-${request.limit}") + val process = Process(arguments(request), None, environment) #| pipe + + def parse: Pipe[F, String, MatchedRow] = { lines => + lines.map { row => + val Array(path, lineNumber) = row.split(":").take(2) //filePath:lineNumber:matchedString + MatchedRow(path, lineNumber.toInt) + } + } + + def arguments(request: SearchRequest): Seq[String] = { + val forExtensions: String = request.filePath match { + case Some(filePath) => filePath + case None => + if (request.sourcesOnly) { + //val testDirsRegexp = ".*(!(\\/test|\\/spec|\\/tests))" + val sourcesExtensionsRegexp = extensions.sourceExtensions.mkString(".*\\.(", "|", ")$") + sourcesExtensionsRegexp + } else ".*" + } + + val regex = RegexConstructor(request.query, request.insensitive, request.spaceInsensitive, request.preciseMatch) + + request.filter match { + case Some(filter) => Seq("csearch", "-n", "-f", forExtensions, regex, filter) + case None => Seq("csearch", "-n", "-f", forExtensions, regex) + } 
+ } + + for { + _ <- logger.debug(s"running CSEARCHINDEX=$indexDir ${arguments(request).mkString(" ")}") + resultRows <- Sync[F].delay(process.lineStream.toList) + parsedResultRows <- Stream.emits(resultRows).through(parse).pure[F] + } yield parsedResultRows + } +} diff --git a/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala b/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala index bcb0ef0..9808734 100644 --- a/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala +++ b/core/src/main/scala/codesearch/core/sources/PackageSourcesUpdater.scala @@ -1,25 +1,45 @@ package codesearch.core.sources -import cats.effect.Sync -import codesearch.core.config.RepositoryConfig -import codesearch.core.db.repository.PackageIndexDbRepository +import cats.effect.concurrent.Deferred +import cats.effect.{Concurrent, Sync, Timer} +import codesearch.core.config.{RateLimiterConfig, RepositoryConfig} +import codesearch.core.db.repository.PackageIndexRepository import codesearch.core.sources.downloader.SourcesDownloader +import upperbound.syntax.rate._ +import fs2.Stream +import upperbound.{Limiter, Rate} +import cats.syntax.all._ + +import scala.concurrent.duration._ trait SourcesUpdater[F[_]] { def update: F[Unit] } -class PackageSourcesUpdater[F[_]: Sync]( - config: RepositoryConfig, - indexDbRepository: PackageIndexDbRepository[F], - downloader: SourcesDownloader[F] +class PackageSourcesUpdater[F[_]: Concurrent: Timer]( + config: RepositoryConfig, indexDbRepository: PackageIndexRepository[F], + downloader: SourcesDownloader[F], + rateLimiterConfig: Option[RateLimiterConfig] = None ) extends SourcesUpdater[F] { def update: F[Unit] = { + // Drain the per-package download effects produced by `updateStream`, + // throttling them when a rate-limiter configuration is present. + val downloads = updateStream + rateLimiterConfig match { + case Some(rate) => + // spread `numberTasks` downloads evenly across each `per`-second window + val delay = ((rate.per * 1000L) / rate.numberTasks.max(1)).millis + downloads.metered(delay).evalMap(identity).compile.drain + case None => + downloads.evalMap(identity).compile.drain + } + } + + // Emits one download effect per latest package version in this repository. + private def updateStream: Stream[F, F[Unit]] =
indexDbRepository - .findNewByRepository(config.repository) + .findLatestByRepository(config.repository) .map(downloader.download) - .compile - .drain - } + } diff --git a/core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala b/core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala deleted file mode 100644 index bc2aa90..0000000 --- a/core/src/main/scala/codesearch/core/sources/downloader/RateLimitedSourcesDownloader.scala +++ /dev/null @@ -1,48 +0,0 @@ -package codesearch.core.sources.downloader - -import java.nio.file.Paths - -import cats.effect.{Concurrent, Sync, Timer} -import codesearch.core.config.PackageDownloaderConfig -import codesearch.core.db.repository.{PackageDbRepository, PackageIndexTableRow} -import codesearch.core.index.repository.Downloader -import codesearch.core.sources.filter.FileFilter -import codesearch.core.sources.unarchiver.SourcesUnarchiver -import com.softwaremill.sttp.Uri -import io.chrisdavenport.log4cats.Logger -import upperbound.{Limiter, Rate} -import cats.effect.concurrent.Deferred -import upperbound.syntax.rate._ - - -import scala.concurrent.duration._ - -object RateLimitedSourcesDownloader { - def apply[F[_]: Concurrent: Timer]( - downloader: Downloader[F], - unarchiver: SourcesUnarchiver[F], - fileFilter: FileFilter[F], - packageDbRepository: PackageDbRepository[F], - config: PackageDownloaderConfig, - logger: Logger[F] - ): SourcesDownloader[F] = new SourcesDownloader[F] { - - def download(index: PackageIndexTableRow): F[Unit] = { - val packageUrl = Uri(config.packageUrl.format(index.name, index.version)) - val archivePath = Paths.get(config.packageArchivePath.format(index.name, index.version)) - val sourcesPath = Paths.get(config.packageSourcesPath.format(index.name, index.version)) - for { - _ <- logger.info(s"Downloading ${index.name}-${index.version} sources") - archive <- downloader.download(packageUrl, archivePath) - sourcesDir <- 
unarchiver.unarchive(archive, sourcesPath) - _ <- fileFilter.filter(sourcesDir) - _ <- packageDbRepository.upsert(index.name, index.version, index.repository) - } yield () - } - - private def rateLimitedDownload: F[Unit] = { - Limiter.start[F](maxRate = 10 every 1.seconds).use { implicit limiter => - } - } - } -} diff --git a/core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala b/core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala deleted file mode 100644 index 359fc93..0000000 --- a/core/src/main/scala/codesearch/core/sources/ratelimiter/RateLimiter.scala +++ /dev/null @@ -1,9 +0,0 @@ -package codesearch.core.sources.ratelimiter - -import scala.concurrent.duration.FiniteDuration - -case class Rate(numberTasks: Int, duration: FiniteDuration) - -trait RateLimiter { - -} \ No newline at end of file From 8f886b9fc49fd2fe7442d69c52305bd4e4506156 Mon Sep 17 00:00:00 2001 From: kamilongus Date: Thu, 16 May 2019 20:56:40 +0400 Subject: [PATCH 7/7] exclude tests dirs via regexp --- .../codesearch/core/config/CindexConfig.scala | 1 + .../core/config/SourcesFilesConfig.scala | 4 +-- .../core/search/SearchRequest.scala | 3 +++ .../engine/csearch/CsearchProvider.scala | 27 ++++++++++--------- .../scala/codesearch/core/syntax/path.scala | 1 + 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/codesearch/core/config/CindexConfig.scala b/core/src/main/scala/codesearch/core/config/CindexConfig.scala index 9d584e7..c6e5459 100644 --- a/core/src/main/scala/codesearch/core/config/CindexConfig.scala +++ b/core/src/main/scala/codesearch/core/config/CindexConfig.scala @@ -3,4 +3,5 @@ package codesearch.core.config case class CindexConfig( indexDir: String, tempIndexDir: String, + packagesToIndexFile: String ) diff --git a/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala b/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala index b313410..d1a5077 100644 --- 
a/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala +++ b/core/src/main/scala/codesearch/core/config/SourcesFilesConfig.scala @@ -3,10 +3,10 @@ package codesearch.core.config case class SourcesFilesConfig( testDirsNames: Set[String], allowedFileNames: Set[String], - filesExtensions: FilesExtensions + filesExtensions: FilesExtensionsConfig ) -case class FilesExtensions( +case class FilesExtensionsConfig( commonExtensions: Set[String], sourceExtensions: Set[String], ) { def extensions: Set[String] = commonExtensions ++ sourceExtensions } diff --git a/core/src/main/scala/codesearch/core/search/SearchRequest.scala b/core/src/main/scala/codesearch/core/search/SearchRequest.scala index 5669c3d..ba4e073 100644 --- a/core/src/main/scala/codesearch/core/search/SearchRequest.scala +++ b/core/src/main/scala/codesearch/core/search/SearchRequest.scala @@ -21,6 +21,7 @@ case class SearchRequest( spaceInsensitive: Boolean, preciseMatch: Boolean, sourcesOnly: Boolean, + excludeTests: Boolean, page: Int, limit: Int ) { @@ -47,6 +48,7 @@ object SearchRequest { spaceInsensitive: String, preciseMatch: String, sourcesOnly: String, + excludeTests: String, page: String, limit: String ): SearchRequest = { @@ -59,6 +61,7 @@ object SearchRequest { isEnabled(spaceInsensitive), isEnabled(preciseMatch), isEnabled(sourcesOnly), + isEnabled(excludeTests), page.toInt, limit.toInt ) diff --git a/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala b/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala index 350e017..902355e 100644 --- a/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala +++ b/core/src/main/scala/codesearch/core/search/engine/csearch/CsearchProvider.scala @@ -4,8 +4,7 @@ import cats.effect.Sync import cats.syntax.flatMap._ import cats.syntax.functor._ import cats.syntax.applicative._ -import codesearch.core.index.directory.СindexDirectory -import 
codesearch.core.index.repository.Extensions +import codesearch.core.config.{CindexConfig, SourcesFilesConfig} import codesearch.core.regex.RegexConstructor import codesearch.core.search.SearchRequest import codesearch.core.search.engine.SearchProvider @@ -18,12 +17,12 @@ case class MatchedRow(path: String, lineNumber: Int) object CsearchProvider { def apply[F[_]: Sync]( - cindex: СindexDirectory, - extensions: Extensions, + sourcesFilesConfig: SourcesFilesConfig, + cindexConfig: CindexConfig, logger: Logger[F] ): SearchProvider[F, SearchRequest, Stream[F, MatchedRow]] = (request: SearchRequest) => { - val indexDir = cindex.indexDirAs[String] + val indexDir = cindexConfig.indexDir val environment = ("CSEARCHINDEX", indexDir) val pipe = Seq("head", s"-${request.limit}") val process = Process(arguments(request), None, environment) #| pipe @@ -36,21 +35,25 @@ object CsearchProvider { def arguments(request: SearchRequest): Seq[String] = { - val forExtensions: String = request.filePath match { + val searchInFilesRegexp: String = request.filePath match { case Some(filePath) => filePath case None => if (request.sourcesOnly) { - //val testDirsRegexp = ".*(!(\\/test|\\/spec|\\/tests))" - val sourcesExtensionsRegexp = extensions.sourceExtensions.mkString(".*\\.(", "|", ")$") - sourcesExtensionsRegexp + val sourcesExtensionsRegexp = + sourcesFilesConfig.filesExtensions.sourceExtensions.mkString(".*\\.(", "|", ")$") + if (request.excludeTests) { + val excludedTestDirsRegexp = sourcesFilesConfig.testDirsNames.mkString("^(?!.*(", "|", "))") // NOTE(review): csearch uses Go's RE2 engine, which rejects negative lookahead `(?!...)` — verify this pattern is accepted, else post-filter matched paths instead + excludedTestDirsRegexp + sourcesExtensionsRegexp + } else sourcesExtensionsRegexp } else ".*" } - val regex = RegexConstructor(request.query, request.insensitive, request.spaceInsensitive, request.preciseMatch) + val queryRegex = + RegexConstructor(request.query, request.insensitive, request.spaceInsensitive, request.preciseMatch) request.filter match { - case Some(filter) => Seq("csearch", "-n", "-f", forExtensions, regex, filter) - case
None => Seq("csearch", "-n", "-f", forExtensions, regex) + case Some(filter) => Seq("csearch", "-n", "-f", searchInFilesRegexp, queryRegex, filter) + case None => Seq("csearch", "-n", "-f", searchInFilesRegexp, queryRegex) } } diff --git a/core/src/main/scala/codesearch/core/syntax/path.scala b/core/src/main/scala/codesearch/core/syntax/path.scala index f2a1436..4cb38eb 100644 --- a/core/src/main/scala/codesearch/core/syntax/path.scala +++ b/core/src/main/scala/codesearch/core/syntax/path.scala @@ -2,6 +2,7 @@ package codesearch.core.syntax import java.nio.file.{Path, Paths} object path { + implicit final def string2Path(pathString: String): Path = Paths.get(pathString) implicit final class RichNioPath(private val parent: Path) extends AnyVal { def /(child: Path): Path = Paths.get(parent.toFile.getPath, child.toFile.getPath) def /(child: String): Path = Paths.get(parent.toFile.getPath, child)