Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

TokenizerWithNGram.scala 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
  1. import com.johnsnowlabs.nlp.annotator._
  2. import com.johnsnowlabs.nlp.base._
  3. import com.johnsnowlabs.util.Benchmark
  4. import org.apache.spark.ml.Pipeline
  5. import org.apache.spark.ml.feature.NGram
  6. import org.apache.spark.sql.SparkSession
  /**
    * Runnable example that chains Spark NLP annotators with Spark ML's `NGram`.
    *
    * Column wiring, as configured below:
    * `text` -> `document` -> `token` -> `normal` -> `finished_normal` -> `3-gram` -> `3-grams`.
    *
    * The pipeline is fitted on an empty `text` DataFrame, applied to two sample
    * sentences, and the resulting DataFrame is printed inside a `Benchmark.time` block.
    * The Spark session is stopped once the output has been shown.
    */
  object TokenizerWithNGram extends App {

    // NOTE(review): per the Spark configuration docs, `spark.driver.memory` should not be
    // set from inside the application in client/local mode because the driver JVM is
    // already running; it is kept here unchanged, but prefer passing it at launch time.
    val spark: SparkSession = SparkSession
      .builder()
      .appName("test")
      .master("local[*]")
      .config("spark.driver.memory", "12G")
      .config("spark.kryoserializer.buffer.max", "200M")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate()

    // Needed for the `.toDS` conversions on local Scala collections below.
    import spark.implicits._

    spark.sparkContext.setLogLevel("WARN")

    // Reads the raw "text" column and writes the "document" column.
    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    // Reads "document" and writes "token".
    val token = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    // Reads "token" and writes "normal".
    val normalizer = new Normalizer()
      .setInputCols("token")
      .setOutputCol("normal")

    // Reads "normal"; no output column is set explicitly. The NGram stage below reads
    // "finished_normal", so that is presumably the Finisher's default output name.
    val finisher = new Finisher()
      .setInputCols("normal")

    // Spark ML NGram with n = 3 over "finished_normal", written to "3-gram".
    val ngram = new NGram()
      .setN(3)
      .setInputCol("finished_normal")
      .setOutputCol("3-gram")

    // Feeds the "3-gram" column back through a DocumentAssembler, producing "3-grams".
    val gramAssembler = new DocumentAssembler()
      .setInputCol("3-gram")
      .setOutputCol("3-grams")

    val pipeline = new Pipeline()
      .setStages(Array(document, token, normalizer, finisher, ngram, gramAssembler))

    val testing = Seq(
      (1, "Google is a famous company"),
      (2, "Peter Parker is a super heroe")
    ).toDS.toDF("_id", "text")

    // The pipeline is fitted on an empty "text" DataFrame and then applied to the samples.
    val result = pipeline.fit(Seq.empty[String].toDS.toDF("text")).transform(testing)

    // Always release the local Spark context, even if showing the result fails.
    try {
      Benchmark.time("Time to convert and show") { result.show(truncate = false) }
    } finally {
      spark.stop()
    }
  }
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...