diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3818b44
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# Project exclude paths
+/project/project/target/
+/project/target/
+/target/
+/target/scala-2.11/classes/
\ No newline at end of file
diff --git a/build.sbt b/build.sbt
new file mode 100644
index 0000000..917f4b1
--- /dev/null
+++ b/build.sbt
@@ -0,0 +1,15 @@
+name := "core"
+
+organization := "io.nlytx"
+
+version := "0.1"
+
+scalaVersion := "2.11.12"
+
+
+libraryDependencies ++= Seq(
+  "org.apache.spark" %% "spark-core" % "2.4.6",
+  "org.apache.spark" %% "spark-sql" % "2.4.6",
+  "org.apache.spark" %% "spark-mllib" % "2.4.6",
+  "com.johnsnowlabs.nlp" %% "spark-nlp" % "2.5.2"
+)
diff --git a/src/main/scala/io/nlytx/core/CoRE.scala b/src/main/scala/io/nlytx/core/CoRE.scala
new file mode 100644
index 0000000..1a36e46
--- /dev/null
+++ b/src/main/scala/io/nlytx/core/CoRE.scala
@@ -0,0 +1,5 @@
+package io.nlytx.core
+
+object CoRE {
+
+}
diff --git a/src/main/scala/io/nlytx/core/spark/File.scala b/src/main/scala/io/nlytx/core/spark/File.scala
new file mode 100644
index 0000000..f9d7d42
--- /dev/null
+++ b/src/main/scala/io/nlytx/core/spark/File.scala
@@ -0,0 +1,43 @@
+package io.nlytx.core.spark
+
+import org.apache.spark.ml.PipelineModel
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+/**
+  * Loads and saves Spark DataFrames and ML pipeline models under a common base path.
+  *
+  * @param basePath prefix joined with `/` in front of every file name
+  * @param spark    implicit session used for reading DataFrames
+  */
+case class File(basePath: String)(implicit val spark: SparkSession) {
+
+  /** Reads `basePath/fileName.dataframe` via `spark.read.load`. */
+  def loadDataFrame(fileName: String): DataFrame =
+    spark.read.load(s"${basePath}/${fileName}.dataframe")
+
+  /**
+    * Writes `df` to `basePath/fileName.dataframe`.
+    *
+    * @param overwrite when true, requests the "overwrite" save mode; otherwise "ErrorIfExists"
+    */
+  def saveDataFrame(df: DataFrame, fileName: String, overwrite: Boolean = false): Unit = {
+    val saveMode = if (overwrite) "overwrite" else "ErrorIfExists"
+    df.write.mode(saveMode)
+      .save(s"${basePath}/${fileName}.dataframe")
+  }
+
+  /** Loads the pipeline model stored at `basePath/fileName.pipelinemodel`. */
+  def loadPipelineModel(fileName: String): PipelineModel =
+    PipelineModel.load(s"${basePath}/${fileName}.pipelinemodel")
+
+  /**
+    * Saves `model` to `basePath/fileName.pipelinemodel`.
+    *
+    * @param overwrite when true, the writer is switched to overwrite mode before saving
+    */
+  def savePipelineModel(model: PipelineModel, fileName: String, overwrite: Boolean = false): Unit = {
+    // Select the writer, then save once; save() returns Unit, so there is nothing to bind.
+    val writer = if (overwrite) model.write.overwrite() else model.write
+    writer.save(s"${basePath}/${fileName}.pipelinemodel")
+  }
+}