Cut a release branch for stable GPU runs WIP #522

Draft · wants to merge 10 commits into main
1 change: 1 addition & 0 deletions .dockerignore
133 changes: 133 additions & 0 deletions constraints.txt
@@ -0,0 +1,133 @@
absl-py==1.4.0
aqtp==0.6.1
array-record==0.5.0
astroid==3.1.0
astunparse==1.6.3
attrs==23.2.0
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
chex==0.1.85
click==8.1.7
cloud-tpu-diagnostics==0.1.5
cloudpickle==3.0.0
contextlib2==21.6.0
dill==0.3.8
dm-tree==0.1.8
etils==1.7.0
exceptiongroup==1.2.0
flatbuffers==24.3.7
flax==0.8.1
fsspec==2024.2.0
gast==0.4.0
google-api-core==2.17.1
google-auth==2.28.2
google-auth-oauthlib==1.0.0
google-cloud-core==2.4.1
google-cloud-storage==2.15.0
google-crc32c==1.5.0
google-pasta==0.2.0
google-resumable-media==2.7.0
googleapis-common-protos==1.63.0
grain-nightly==0.0.6
grpcio==1.62.1
gviz-api==1.10.0
h5py==3.10.0
idna==3.6
immutabledict==4.2.0
importlab==0.8.1
importlib_resources==6.3.0
iniconfig==2.0.0
isort==5.13.2
jax==0.4.25
jaxlib==0.4.25
jaxtyping==0.2.28
Jinja2==3.1.3
keras==2.13.1
libclang==16.0.6
libcst==1.2.0
Markdown==3.5.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mccabe==0.7.0
mdurl==0.1.2
ml-collections==0.1.1
ml-dtypes==0.3.2
mlperf-logging==3.0.0
more-itertools==10.2.0
msgpack==1.0.8
msgspec==0.18.6
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.1
ninja==1.11.1.1
numpy==1.24.3
nvidia-cublas-cu12==12.4.2.65
nvidia-cuda-cupti-cu12==12.4.99
nvidia-cuda-nvcc-cu12==12.4.99
nvidia-cuda-nvrtc-cu12==12.4.99
nvidia-cuda-runtime-cu12==12.4.99
nvidia-cudnn-cu12==8.9.7.29
nvidia-cufft-cu12==11.2.0.44
nvidia-cusolver-cu12==11.6.0.99
nvidia-cusparse-cu12==12.3.0.142
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.4.99
oauthlib==3.2.2
opt-einsum==3.3.0
optax==0.2.1
orbax-checkpoint==0.5.5
packaging==24.0
pandas==2.2.1
platformdirs==4.2.0
pluggy==1.4.0
promise==2.3
protobuf==3.20.3
psutil==5.9.8
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycnite==2023.10.11
pydot==2.0.0
Pygments==2.17.2
pylint==3.1.0
pyparsing==3.1.2
pytest==8.1.1
python-dateutil==2.9.0.post0
pytype==2024.3.11
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
requests-oauthlib==1.4.0
rich==13.7.1
rsa==4.9
scipy==1.12.0
sentencepiece==0.1.97
six==1.16.0
tabulate==0.9.0
tensorboard==2.13.0
tensorboard-data-server==0.7.2
tensorboard_plugin_profile==2.15.1
tensorboardX==2.6.2.2
tensorflow==2.13.1
tensorflow-datasets==4.9.4
tensorflow-estimator==2.13.0
tensorflow-hub==0.16.1
tensorflow-io-gcs-filesystem==0.36.0
tensorflow-metadata==1.14.0
tensorflow-text==2.13.0
tensorstore==0.1.54
termcolor==2.4.0
tf-keras==2.15.0
toml==0.10.2
tomli==2.0.1
tomlkit==0.12.4
toolz==0.12.1
tqdm==4.66.2
typeguard==2.13.3
typing-inspect==0.9.0
typing_extensions==4.5.0
tzdata==2024.1
urllib3==2.2.1
Werkzeug==3.0.1
wrapt==1.16.0
zipp==3.18.0
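For reference, a constraints file like this pins transitive dependency versions without installing anything itself; it is consumed through pip's `-c` flag. A minimal sketch, assuming the repo's existing `requirements.txt` sits at the root next to `constraints.txt`:

```bash
# Create an isolated environment and install with the release pins applied.
# (Paths are assumptions; adjust to where the files live in your checkout.)
python3 -m venv venv && source venv/bin/activate
pip install -r requirements.txt -c constraints.txt
```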
6 changes: 6 additions & 0 deletions gke/gpu/llama/Chart.yaml
@@ -0,0 +1,6 @@
apiVersion: v2
name: llama70b
description: llama70b
type: application
version: 0.1.0
appVersion: "1.16.0"
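As a quick sanity check, the chart can be rendered or linted locally before touching a cluster; a sketch, assuming it is run from the repo root:

```bash
# Render the templated JobSet manifest without deploying it.
helm template llama70b gke/gpu/llama

# Validate chart structure and required fields.
helm lint gke/gpu/llama
```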
131 changes: 131 additions & 0 deletions gke/gpu/llama/templates/llama_70b.yaml
@@ -0,0 +1,131 @@
{{- $root := . -}}
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
name: llama70b-maxtext
labels:
xpk.google.com/workload: llama70b-maxtext
spec:
failurePolicy:
maxRestarts: 0
replicatedJobs:
- name: slice-job
replicas: 1
template:
spec:
parallelism: {{ $root.Values.workload.nodes }}
completions: {{ $root.Values.workload.nodes }}
backoffLimit: 0 # When any pod fails, the job is failed
template:
metadata:
labels:
xpk.google.com/workload: llama70b-maxtext
spec:
schedulerName: default-scheduler
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-accelerator
operator: Exists
- key: cloud.google.com/gke-nodepool
operator: In
values: [a3plus-multi-nic]
nodeSelector:
cloud.google.com/gke-accelerator: nvidia-h100-80gb
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
terminationGracePeriodSeconds: 30
tolerations:
- operator: "Exists"
key: nvidia.com/gpu
volumes:
- name: nvidia-install-dir-host
hostPath:
path: /home/kubernetes/bin/nvidia/lib64
- name: shared-memory
emptyDir:
medium: "Memory"
sizeLimit: 1Gi
- name: workload-terminated-volume
emptyDir: {}
containers:
- name: fastrak-daemon
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.3
imagePullPolicy: Always
command:
- "bash"
- "-c"
- |
  set -ex
  chmod 755 /fts/entrypoint_rxdm_container.sh
  # Start the RxDM daemon in the background, then block until the training
  # container signals completion through the shared emptyDir volume.
  /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr &
  while [ ! -e "/usr/share/maxtext/workload_terminated" ]; do sleep 10; echo "sleeping"; done
securityContext:
privileged: true
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia/lib64
- name: workload-terminated-volume
mountPath: /usr/share/maxtext
env:
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
- name: maxtext-fastrak
image: "{{ $root.Values.workload.image }}"
imagePullPolicy: Always
securityContext:
privileged: true
ports:
- containerPort: 6002
env:
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: JAX_COORDINATOR_ADDRESS
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
- name: NNODES
value: "{{ $root.Values.workload.nodes }}"
- name: NODE_RANK
valueFrom:
fieldRef:
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
- name: USE_GPUDIRECT
value: "fastrak"
- name: GPUS_PER_NODE
value: "8"
- name: JAX_COORDINATOR_PORT
value: "6002"
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
- name: COMMAND
value: "python MaxText/train.py MaxText/configs/base.yml hardware=gpu run_name=2024-03-07-20-59 steps={{ $root.Values.workload.steps }} per_device_batch_size={{ $root.Values.workload.per_device_batch_size }} model_name={{ $root.Values.workload.model_name }} enable_checkpointing=false attention=dot_product dataset_type=synthetic async_checkpointing=false"
- name: XLA_FLAGS
value: "{{ $root.Values.workload.xla_flags }}"
command:
- "bash"
- "-c"
- |
  echo "XPK Start: $(date)"
  _sigterm() { kill -SIGTERM "$PID"; }
  trap _sigterm SIGTERM
  (cd /deps && bash gpu_multi_process_run.sh) &
  PID=$!
  # Poll instead of a bare `wait` so a trapped SIGTERM can still be forwarded mid-run.
  while kill -0 "$PID" 2>/dev/null; do sleep 5; done
  # Recover the workload's real exit code; the poll loop itself always ends nonzero.
  wait "$PID"
  EXIT_CODE=$?
  echo "XPK End: $(date)"
  echo "EXIT_CODE=$EXIT_CODE"
  # Tell the fastrak-daemon sidecar to shut down.
  echo "Main app is done" > /usr/share/maxtext/workload_terminated
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia/lib64
- name: shared-memory
mountPath: /dev/shm
- name: workload-terminated-volume
mountPath: /usr/share/maxtext
resources:
limits:
nvidia.com/gpu: 8
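Once installed, the run can be monitored with standard tooling; a sketch, assuming the JobSet CRD is installed and kubectl targets the GKE cluster:

```bash
# Check the JobSet and the pods it fans out across the nodes.
kubectl get jobset llama70b-maxtext
kubectl get pods -l xpk.google.com/workload=llama70b-maxtext

# Follow training logs from the MaxText container (not the sidecar).
kubectl logs -f -l xpk.google.com/workload=llama70b-maxtext -c maxtext-fastrak
```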
7 changes: 7 additions & 0 deletions gke/gpu/llama/values.yaml
@@ -0,0 +1,7 @@
workload:
nodes: 16
image: "us-central1-docker.pkg.dev/gce-ai-infra/maxtext/maxtext_base_image:03_14_2024_release"
steps: 30
per_device_batch_size: 12
model_name: llama2-7b
xla_flags: "--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=8589934592 --xla_gpu_all_gather_combine_threshold_bytes=8589934592 --xla_gpu_reduce_scatter_combine_threshold_bytes=8589934592 --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization --xla_gpu_enable_async_collective_permute=true --xla_gpu_enable_async_all_to_all=true"
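These defaults can be overridden per run at install time rather than by editing the file; a minimal sketch with illustrative values (the release name and the specific overrides are assumptions, not part of this PR):

```bash
# Deploy the chart, overriding the run shape on the command line.
helm install llama70b gke/gpu/llama \
  --set workload.nodes=32 \
  --set workload.per_device_batch_size=4 \
  --set workload.model_name=llama2-70b
```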
24 changes: 0 additions & 24 deletions gke/gpu/maxtext_chart/Chart.yaml

This file was deleted.