Cut a release branch for stable GPU runs WIP #522

Draft · wants to merge 10 commits into main
1 change: 1 addition & 0 deletions .dockerignore
133 changes: 133 additions & 0 deletions constraints.txt
@@ -0,0 +1,133 @@
absl-py==1.4.0
aqtp==0.6.1
array-record==0.5.0
astroid==3.1.0
astunparse==1.6.3
attrs==23.2.0
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
chex==0.1.85
click==8.1.7
cloud-tpu-diagnostics==0.1.5
cloudpickle==3.0.0
contextlib2==21.6.0
dill==0.3.8
dm-tree==0.1.8
etils==1.7.0
exceptiongroup==1.2.0
flatbuffers==24.3.7
flax==0.8.1
fsspec==2024.2.0
gast==0.4.0
google-api-core==2.17.1
google-auth==2.28.2
google-auth-oauthlib==1.0.0
google-cloud-core==2.4.1
google-cloud-storage==2.15.0
google-crc32c==1.5.0
google-pasta==0.2.0
google-resumable-media==2.7.0
googleapis-common-protos==1.63.0
grain-nightly==0.0.6
grpcio==1.62.1
gviz-api==1.10.0
h5py==3.10.0
idna==3.6
immutabledict==4.2.0
importlab==0.8.1
importlib_resources==6.3.0
iniconfig==2.0.0
isort==5.13.2
jax==0.4.25
jaxlib==0.4.25
jaxtyping==0.2.28
Jinja2==3.1.3
keras==2.13.1
libclang==16.0.6
libcst==1.2.0
Markdown==3.5.2
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mccabe==0.7.0
mdurl==0.1.2
ml-collections==0.1.1
ml-dtypes==0.3.2
mlperf-logging==3.0.0
more-itertools==10.2.0
msgpack==1.0.8
msgspec==0.18.6
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.1
ninja==1.11.1.1
numpy==1.24.3
nvidia-cublas-cu12==12.4.2.65
nvidia-cuda-cupti-cu12==12.4.99
nvidia-cuda-nvcc-cu12==12.4.99
nvidia-cuda-nvrtc-cu12==12.4.99
nvidia-cuda-runtime-cu12==12.4.99
nvidia-cudnn-cu12==8.9.7.29
nvidia-cufft-cu12==11.2.0.44
nvidia-cusolver-cu12==11.6.0.99
nvidia-cusparse-cu12==12.3.0.142
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.4.99
oauthlib==3.2.2
opt-einsum==3.3.0
optax==0.2.1
orbax-checkpoint==0.5.5
packaging==24.0
pandas==2.2.1
platformdirs==4.2.0
pluggy==1.4.0
promise==2.3
protobuf==3.20.3
psutil==5.9.8
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycnite==2023.10.11
pydot==2.0.0
Pygments==2.17.2
pylint==3.1.0
pyparsing==3.1.2
pytest==8.1.1
python-dateutil==2.9.0.post0
pytype==2024.3.11
pytz==2024.1
PyYAML==6.0.1
requests==2.31.0
requests-oauthlib==1.4.0
rich==13.7.1
rsa==4.9
scipy==1.12.0
sentencepiece==0.1.97
six==1.16.0
tabulate==0.9.0
tensorboard==2.13.0
tensorboard-data-server==0.7.2
tensorboard_plugin_profile==2.15.1
tensorboardX==2.6.2.2
tensorflow==2.13.1
tensorflow-datasets==4.9.4
tensorflow-estimator==2.13.0
tensorflow-hub==0.16.1
tensorflow-io-gcs-filesystem==0.36.0
tensorflow-metadata==1.14.0
tensorflow-text==2.13.0
tensorstore==0.1.54
termcolor==2.4.0
tf-keras==2.15.0
toml==0.10.2
tomli==2.0.1
tomlkit==0.12.4
toolz==0.12.1
tqdm==4.66.2
typeguard==2.13.3
typing-inspect==0.9.0
typing_extensions==4.5.0
tzdata==2024.1
urllib3==2.2.1
Werkzeug==3.0.1
wrapt==1.16.0
zipp==3.18.0
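For reference, a constraints file like this pins transitive dependency versions without installing anything itself; it is consumed through pip's `-c` flag. A minimal sketch, assuming the repo's existing `requirements.txt` sits at the root next to `constraints.txt`:

```bash
# Create an isolated environment and install with the release pins applied.
# (Paths are assumptions; adjust to where the files live in your checkout.)
python3 -m venv venv && source venv/bin/activate
pip install -r requirements.txt -c constraints.txt
```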
6 changes: 6 additions & 0 deletions gke/gpu/llama/Chart.yaml
@@ -0,0 +1,6 @@
apiVersion: v2
name: llama70b
description: llama70b
type: application
version: 0.1.0
appVersion: "1.16.0"
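As a quick sanity check, the chart can be rendered or linted locally before touching a cluster; a sketch, assuming it is run from the repo root:

```bash
# Render the templated JobSet manifest without deploying it.
helm template llama70b gke/gpu/llama

# Validate chart structure and required fields.
helm lint gke/gpu/llama
```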
131 changes: 131 additions & 0 deletions gke/gpu/llama/templates/llama_70b.yaml
@@ -0,0 +1,131 @@
{{- $root := . -}}
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
name: llama70b-maxtext
labels:
xpk.google.com/workload: llama70b-maxtext
spec:
failurePolicy:
maxRestarts: 0
replicatedJobs:
- name: slice-job
replicas: 1
template:
spec:
parallelism: {{ $root.Values.workload.nodes }}
completions: {{ $root.Values.workload.nodes }}
backoffLimit: 0 # When any pod fails, the job is failed
template:
metadata:
labels:
xpk.google.com/workload: llama70b-maxtext
spec:
schedulerName: default-scheduler
restartPolicy: Never
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-accelerator
operator: Exists
- key: cloud.google.com/gke-nodepool
operator: In
values: [a3plus-multi-nic]
nodeSelector:
cloud.google.com/gke-accelerator: nvidia-h100-80gb
hostNetwork: true
dnsPolicy: ClusterFirstWithHostNet
terminationGracePeriodSeconds: 30
tolerations:
- operator: "Exists"
key: nvidia.com/gpu
volumes:
- name: nvidia-install-dir-host
hostPath:
path: /home/kubernetes/bin/nvidia/lib64
- name: shared-memory
emptyDir:
medium: "Memory"
sizeLimit: 1Gi
- name: workload-terminated-volume
emptyDir: {}
containers:
- name: fastrak-daemon
image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.3
imagePullPolicy: Always
command:
- "bash"
- "-c"
- |
  set -ex
  chmod 755 /fts/entrypoint_rxdm_container.sh
  # Start the RxDM daemon in the background, then block until the training
  # container signals completion through the shared emptyDir volume.
  /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr &
  while [ ! -e "/usr/share/maxtext/workload_terminated" ]; do sleep 10; echo "sleeping"; done
securityContext:
privileged: true
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia/lib64
- name: workload-terminated-volume
mountPath: /usr/share/maxtext
env:
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
- name: maxtext-fastrak
image: "{{ $root.Values.workload.image }}"
imagePullPolicy: Always
securityContext:
privileged: true
ports:
- containerPort: 6002
env:
- name: REPLICATED_JOB_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
- name: JOBSET_NAME
valueFrom:
fieldRef:
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
- name: JAX_COORDINATOR_ADDRESS
value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
- name: NNODES
value: "{{ $root.Values.workload.nodes }}"
- name: NODE_RANK
valueFrom:
fieldRef:
fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
- name: USE_GPUDIRECT
value: "fastrak"
- name: GPUS_PER_NODE
value: "8"
- name: JAX_COORDINATOR_PORT
value: "6002"
- name: LD_LIBRARY_PATH
value: /usr/local/nvidia/lib64
- name: COMMAND
value: "python MaxText/train.py MaxText/configs/base.yml hardware=gpu run_name=2024-03-07-20-59 steps={{ $root.Values.workload.steps }} per_device_batch_size={{ $root.Values.workload.per_device_batch_size }} model_name={{ $root.Values.workload.model_name }} enable_checkpointing=false attention=dot_product dataset_type=synthetic async_checkpointing=false"
- name: XLA_FLAGS
value: "{{ $root.Values.workload.xla_flags }}"
command:
- "bash"
- "-c"
- |
  echo "XPK Start: $(date)"
  _sigterm() { kill -SIGTERM "$PID"; }
  trap _sigterm SIGTERM
  (cd /deps && bash gpu_multi_process_run.sh) &
  PID=$!
  # Poll instead of a bare `wait` so a trapped SIGTERM can still be forwarded mid-run.
  while kill -0 "$PID" 2>/dev/null; do sleep 5; done
  # Recover the workload's real exit code; the poll loop itself always ends nonzero.
  wait "$PID"
  EXIT_CODE=$?
  echo "XPK End: $(date)"
  echo "EXIT_CODE=$EXIT_CODE"
  # Tell the fastrak-daemon sidecar to shut down.
  echo "Main app is done" > /usr/share/maxtext/workload_terminated
volumeMounts:
- name: nvidia-install-dir-host
mountPath: /usr/local/nvidia/lib64
- name: shared-memory
mountPath: /dev/shm
- name: workload-terminated-volume
mountPath: /usr/share/maxtext
resources:
limits:
nvidia.com/gpu: 8
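Once installed, the run can be monitored with standard tooling; a sketch, assuming the JobSet CRD is installed and kubectl targets the GKE cluster:

```bash
# Check the JobSet and the pods it fans out across the nodes.
kubectl get jobset llama70b-maxtext
kubectl get pods -l xpk.google.com/workload=llama70b-maxtext

# Follow training logs from the MaxText container (not the sidecar).
kubectl logs -f -l xpk.google.com/workload=llama70b-maxtext -c maxtext-fastrak
```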
7 changes: 7 additions & 0 deletions gke/gpu/llama/values.yaml
@@ -0,0 +1,7 @@
workload:
nodes: 16
image: "us-central1-docker.pkg.dev/gce-ai-infra/maxtext/maxtext_base_image:03_14_2024_release"
steps: 30
per_device_batch_size: 12
model_name: llama2-7b
xla_flags: "--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true --xla_gpu_all_reduce_combine_threshold_bytes=8589934592 --xla_gpu_all_gather_combine_threshold_bytes=8589934592 --xla_gpu_reduce_scatter_combine_threshold_bytes=8589934592 --xla_gpu_enable_pipelined_all_gather=true --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true --xla_gpu_enable_while_loop_double_buffering=true --xla_gpu_enable_triton_softmax_fusion=false --xla_gpu_enable_all_gather_combine_by_dim=false --xla_gpu_enable_reduce_scatter_combine_by_dim=false --xla_disable_hlo_passes=rematerialization --xla_gpu_enable_async_collective_permute=true --xla_gpu_enable_async_all_to_all=true"
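These defaults can be overridden per run at install time rather than by editing the file; a minimal sketch with illustrative values (the release name and the specific overrides are assumptions, not part of this PR):

```bash
# Deploy the chart, overriding the run shape on the command line.
helm install llama70b gke/gpu/llama \
  --set workload.nodes=32 \
  --set workload.per_device_batch_size=4 \
  --set workload.model_name=llama2-70b
```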
24 changes: 0 additions & 24 deletions gke/gpu/maxtext_chart/Chart.yaml

This file was deleted.