megamolbart_pretrain_slurm.sh

#!/bin/bash
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-node=8               # n gpus per machine <required>
#SBATCH --mail-type=FAIL
#SBATCH --time=8:00:00
#SBATCH --partition=batch_dgx1_m2
#SBATCH --account=ent_aiapps_omics
#SBATCH --job-name=bionemo
#SBATCH --nv-meta=ml-model.megamolbart
#SBATCH --mem=0                         # all mem avail
#SBATCH --overcommit
#SBATCH --exclusive                     # exclusive node access
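# Note: with 8 nodes at 8 tasks and 8 GPUs each, this job requests 64 GPUs
# in total, one task per GPU.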
# Copyright (c) 2022, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x

# Parse command-line arguments. Only -p/--prop is accepted; it names one or
# more properties files whose values override the defaults below.
while [[ $# -gt 0 ]]; do
    case $1 in
        -p|--prop)
            echo "Overwriting values from $2."
            PROPERTY_FILES=$2
            shift
            shift
            ;;
        *)
            echo 'Invalid input'
            exit 1
            ;;
    esac
done
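# Example invocation (file names are hypothetical; any shell fragment with
# KEY=VALUE assignments works, and multiple files may be comma-separated):
#   sbatch megamolbart_pretrain_slurm.sh --prop common.env,cluster.env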
# All variables with default values must be defined in this section
#=========================
MEGAMOLBART_CONT="nvcr.io#t6a4nuz8vrsr/megamolbart:0.2.0-ea3"
STORAGE_ROOT=""
WANDB_API_KEY=""
MICRO_BATCH_SIZE=256 # Please check GPU mem size. 256 is recommended for an A100 with 80 GB mem.
VAL_CHECK_INTERVAL=200
JOB_TYPE='nemo-chem'
EXP_NAME_PREFIX='nemo_chem'
#=========================
set -e

# Any value that needs to be overwritten should be defined in PROPERTY_FILES
if [ ! -z "${PROPERTY_FILES}" ]; then
    IFS=',' read -ra FILES <<< "${PROPERTY_FILES}"
    for PROPERTY_FILE in "${FILES[@]}"; do
        source "${PROPERTY_FILE}"
    done
fi

if [ -z "${STORAGE_ROOT}" ]; then
    echo "STORAGE_ROOT is invalid. STORAGE_ROOT=${STORAGE_ROOT}. Please check the properties file."
    exit 1
fi
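# A properties file is just a sourced shell fragment. A minimal sketch, with
# all values hypothetical:
#   STORAGE_ROOT=/lustre/fsw/my_team/megamolbart
#   WANDB_API_KEY=0123456789abcdef
#   MICRO_BATCH_SIZE=128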
EXP_NAME=${EXP_NAME_PREFIX}_node_${SLURM_JOB_NUM_NODES}_inv${VAL_CHECK_INTERVAL}
DATA_PATH="${STORAGE_ROOT}/data"
RESULT_PATH="${STORAGE_ROOT}/results/${EXP_NAME}"
MOUNTS="$DATA_PATH:/data,$RESULT_PATH:/result"
mkdir -p "${RESULT_PATH}"

# This configuration assumes TP and PP are 1 (no tensor or pipeline parallelism),
# so the global batch is simply micro batch x total data-parallel ranks.
GLOBAL_BATCH_SIZE=$(expr ${MICRO_BATCH_SIZE} \* ${SLURM_JOB_NUM_NODES} \* ${SLURM_NTASKS_PER_NODE})
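# With the defaults above (micro batch 256, 8 nodes, 8 tasks per node) this
# yields GLOBAL_BATCH_SIZE = 256 * 8 * 8 = 16384.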
# NeMo and BioNeMo code is picked up from the container. To use code from a shared
# folder instead, please set NEMO_CODE and BIONEMO_CODE in the properties file.
if [ ! -z "${NEMO_CODE}" ]; then
    MOUNTS="${MOUNTS},${NEMO_CODE}:/opt/nvidia/nemo"
fi

if [ ! -z "${BIONEMO_CODE}" ]; then
    MOUNTS="${MOUNTS},${BIONEMO_CODE}:/opt/nvidia/nemo_chem"
fi
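# Example (hypothetical paths): to run against local checkouts, add lines like
# these to a properties file; the checkouts are then bind-mounted over the
# in-container copies:
#   NEMO_CODE=/home/${USER}/code/NeMo
#   BIONEMO_CODE=/home/${USER}/code/nemo_chem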
set -x
srun \
    --output slurm-%j-%n.out \
    --error error-%j-%n.out \
    --container-image ${MEGAMOLBART_CONT} \
    --container-mounts ${MOUNTS} \
    --container-workdir /opt/nvidia/nemo_chem/examples/chem/ \
    --export WANDB_API_KEY="${WANDB_API_KEY}" \
    python megamolbart_pretrain.py \
        --config-path=conf \
        --config-name=megamolbart_pretrain_small_span_aug \
        ++exp_manager.wandb_logger_kwargs.job_type="${JOB_TYPE}" \
        ++exp_manager.wandb_logger_kwargs.name="${EXP_NAME}" \
        ++trainer.num_nodes=${SLURM_JOB_NUM_NODES} \
        ++trainer.gpus=${SLURM_NTASKS_PER_NODE} \
        ++trainer.val_check_interval=${VAL_CHECK_INTERVAL} \
        ++trainer.max_steps=20000000 \
        model.micro_batch_size=${MICRO_BATCH_SIZE} \
        model.global_batch_size=${GLOBAL_BATCH_SIZE} \
        model.tokenizer.model=/opt/nvidia/nemo_chem/models/vocab/megamolbart.model \
        model.tokenizer.vocab_file=/opt/nvidia/nemo_chem/models/vocab/megamolbart.vocab \
        model.data.links_file=/opt/nvidia/nemo_chem/examples/chem/conf/dataset/ZINC-downloader-test.txt \
        model.data.dataset.val=x000-small

set +x
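# The key=value arguments above are Hydra overrides (NeMo composes its configs
# with Hydra); the ++ prefix adds the key if it is absent and replaces it
# otherwise. A quick interactive sanity check of the same override syntax,
# assuming a single-node shell inside the container, might look like:
#   python megamolbart_pretrain.py --config-path=conf \
#       --config-name=megamolbart_pretrain_small_span_aug \
#       ++trainer.num_nodes=1 ++trainer.gpus=1 ++trainer.max_steps=100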