Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

extract-corpus.sh 1.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
  1. #!/usr/bin/env bash
  2. set -e
  3. DATASETS_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." >/dev/null 2>&1 && pwd )"
  4. PREFIX='./java_projects'
  5. if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] || [ -z "$4" ] || [ -z "$5" ] || [ -z "$6" ]; then
  6. echo "Not all input params are spicified"
  7. exit 1
  8. fi
  9. archive_path="$DATASETS_ROOT_DIR/$1"
  10. train_projects_file_path="$DATASETS_ROOT_DIR/$2"
  11. valid_projects_file_path="$DATASETS_ROOT_DIR/$3"
  12. test_projects_file_path="$DATASETS_ROOT_DIR/$4"
  13. demo_file="$PREFIX/$5"
  14. output_folder="$DATASETS_ROOT_DIR/$6"
  15. if ! [ -d "$output_folder" ]; then
  16. mkdir "$output_folder"
  17. fi
  18. extract_project_bunch() {
  19. bunch="$1"
  20. projects_file_path="$2"
  21. echo "Extracting $bunch projects: "
  22. projects=$(< "$projects_file_path" tr '\n' ' ' | sed "s/ $//")
  23. echo "$projects"
  24. tar_parameter=$(echo " $projects" | sed "s~ ~ $PREFIX/~g")
  25. tar -xzf "$archive_path" -C "$output_folder" $tar_parameter
  26. mv "$output_folder/$PREFIX" "$output_folder/$bunch"
  27. }
  28. extract_project_bunch train "$train_projects_file_path"
  29. extract_project_bunch valid "$valid_projects_file_path"
  30. extract_project_bunch "test" "$test_projects_file_path"
  31. echo "Extracting demo file: $demo_file"
  32. tar -xzf "$archive_path" -C "$output_folder" "$demo_file"
  33. mv "$output_folder/$PREFIX" "$output_folder/demo"
  34. set +e
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...