Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

corpus-stats.sh 1.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
  1. #!/usr/bin/env bash
  2. set -euo pipefail
  3. DATASETS_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )"
  4. CORPUS_DIR="$DATASETS_ROOT_DIR/$1"
  5. OUTPUT_FILE="$DATASETS_ROOT_DIR/$2"
  6. calc_metadata() {
  7. local _corpus
  8. _corpus="$1"
  9. local _size
  10. _size=$(du -sm "$_corpus" | awk '{print $1}' )
  11. local _files
  12. _files=$(find "$_corpus" -type f | wc -l)
  13. local _loc
  14. _loc=$(find "$_corpus" -type f -exec wc -l {} + | awk '/total/{print $1}' | paste -sd+ /dev/stdin | bc)
  15. local _json_string
  16. _json_string=$(jq -n \
  17. --argjson size_mb "$_size" \
  18. --argjson files "$_files" \
  19. --argjson loc "$_loc" \
  20. '{size_mb: $size_mb, files: $files, loc: $loc}' )
  21. echo "$_json_string"
  22. }
  23. TRAIN_STATS=$(calc_metadata "$CORPUS_DIR/train")
  24. VALID_STATS=$(calc_metadata "$CORPUS_DIR/valid")
  25. TEST_STATS=$(calc_metadata "$CORPUS_DIR/test")
  26. JSON_STRING=$(jq -n \
  27. --argjson train_stats "$TRAIN_STATS" \
  28. --argjson valid_stats "$VALID_STATS" \
  29. --argjson test_stats "$TEST_STATS" \
  30. '{train: $train_stats, valid: $valid_stats, test: $test_stats}' )
  31. echo "$CORPUS_DIR.metadata"
  32. echo "$JSON_STRING" > "$OUTPUT_FILE"
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...