frequent_docwords.parquet data docword.txt data
Posted: Fri May 20, 2022 5:25 pm
frequent_docwords.parquet data
docword.txt data
+---------+-----+-----+-----------+
|     word|docId|count|firstLetter|
+---------+-----+-----+-----------+
|    plane|    1| 1000|          p|
|    plane|    3|  100|          p|
|      car|    2|  500|          c|
|      car|    1|  120|          c|
|motorbike|    1| 1200|          m|
|motorbike|    2|  702|          m|
|motorbike|    3|  600|          m|
|    truck|    3|  122|          t|
|     boat|    3| 2000|          b|
|     boat|    2|  200|          b|
+---------+-----+-----+-----------+
vocabId docId count
3 3 600
3 2 702
2 1 120
5 2 200
2 2 500
1 3 100
5 3 2000
4 3 122
3 1 1200
1 1 1000
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._

// Define case classes for input data

/** One row of the docword input file: three integer columns read in the order docId, vocabId, count. */
case class Docword(docId: Int, vocabId: Int, count: Int)

object Main {

  /** Assignment entry point for Task 4a.
    *
    * Loads the space-delimited docword file into a typed `Dataset[Docword]` and defines the
    * target Parquet path. The filtering and writing logic is intentionally left as a TODO
    * for the student.
    *
    * @param spark the active SparkSession created by `main`
    */
  def solution(spark: SparkSession) {
    import spark.implicits._

    // Read the input data
    val docwords = spark.read.
      schema(Encoders.product[Docword].schema).
      option("delimiter", " ").
      csv("Assignment_Data/docword-small.txt").
      as[Docword]
    val frequentDocwordsFilename = "Assignment_Data/frequent_docwords.parquet"

    // TODO: *** Put your solution here ***
  }

  // Do not edit the main function
  def main(args: Array[String]) {
    // Set log level
    import org.apache.log4j.{Logger, Level}
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    // Initialise Spark
    val spark = SparkSession.builder
      .appName("Task4a")
      .master("local[4]")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    // Run solution code
    solution(spark)
    // Stop Spark
    spark.stop()
  }
}
[Spark SQL] Remove all rows which reference infrequently occurring words from docwords. Store the resulting dataframe in Parquet format at "../frequent_docwords.parquet" and in CSV format at "Task_4a-out". An infrequently occurring word is any word that appears fewer than 1000 times in the entire corpus of documents. For the small example input file the expected output is:
3,1,1200
3,2,702
3,3,600
5,3,2000
5,2,200
1,1,1000
1,3,100
docword.txt data
+---------+-----+-----+-----------+
|     word|docId|count|firstLetter|
+---------+-----+-----+-----------+
|    plane|    1| 1000|          p|
|    plane|    3|  100|          p|
|      car|    2|  500|          c|
|      car|    1|  120|          c|
|motorbike|    1| 1200|          m|
|motorbike|    2|  702|          m|
|motorbike|    3|  600|          m|
|    truck|    3|  122|          t|
|     boat|    3| 2000|          b|
|     boat|    2|  200|          b|
+---------+-----+-----+-----------+
vocabId docId count
3 3 600
3 2 702
2 1 120
5 2 200
2 2 500
1 3 100
5 3 2000
4 3 122
3 1 1200
1 1 1000
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._

// Define case classes for input data

/** One row of the docword input file: three integer columns read in the order docId, vocabId, count. */
case class Docword(docId: Int, vocabId: Int, count: Int)

object Main {

  /** Assignment entry point for Task 4a.
    *
    * Loads the space-delimited docword file into a typed `Dataset[Docword]` and defines the
    * target Parquet path. The filtering and writing logic is intentionally left as a TODO
    * for the student.
    *
    * @param spark the active SparkSession created by `main`
    */
  def solution(spark: SparkSession) {
    import spark.implicits._

    // Read the input data
    val docwords = spark.read.
      schema(Encoders.product[Docword].schema).
      option("delimiter", " ").
      csv("Assignment_Data/docword-small.txt").
      as[Docword]
    val frequentDocwordsFilename = "Assignment_Data/frequent_docwords.parquet"

    // TODO: *** Put your solution here ***
  }

  // Do not edit the main function
  def main(args: Array[String]) {
    // Set log level
    import org.apache.log4j.{Logger, Level}
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)
    // Initialise Spark
    val spark = SparkSession.builder
      .appName("Task4a")
      .master("local[4]")
      .config("spark.sql.shuffle.partitions", 1)
      .getOrCreate()
    // Run solution code
    solution(spark)
    // Stop Spark
    spark.stop()
  }
}
[Spark SQL] Remove all rows which reference infrequently occurring words from docwords. Store the resulting dataframe in Parquet format at "../frequent_docwords.parquet" and in CSV format at "Task_4a-out". An infrequently occurring word is any word that appears fewer than 1000 times in the entire corpus of documents. For the small example input file the expected output is:
3,1,1200
3,2,702
3,3,600
5,3,2000
5,2,200
1,1,1000
1,3,100