
Commit 3514b5c

Add 70B models support for fine tuning job submission (#28)
* Revert "Remove 70B models (not yet supported) (#26)". This reverts commit 6b7ff20.
* Add a check on n_checkpoints to limit the number of checkpoints to 1 for 70B models.
* Change the default batch-size behavior for 70B models, and only allow a fixed batch size.
* Format changes only.
* Address PR review.
* Remove duplicate code.
* Black fixes.
1 parent 2af9f1e commit 3514b5c

File tree

3 files changed: +35 -7 lines changed


src/together/commands/finetune.py

Lines changed: 11 additions & 1 deletion
@@ -77,7 +77,7 @@ def _add_create(parser: argparse._SubParsersAction[argparse.ArgumentParser]) ->
         "--batch-size",
         "-b",
         metavar="BATCH_SIZE",
-        default=32,
+        default=None,
         help="The batch size to use for training. Default=32",
         type=int,
     )
@@ -281,6 +281,16 @@ def _add_checkpoints(
 def _run_create(args: argparse.Namespace) -> None:
     finetune = Finetune()
 
+    # Set default batch size based on model
+    if args.batch_size is None:
+        if args.model in [
+            "togethercomputer/llama-2-70b",
+            "togethercomputer/llama-2-70b-chat",
+        ]:
+            args.batch_size = 144
+        else:
+            args.batch_size = 32
+
     response = finetune.create(
         training_file=args.training_file,  # training file_id
         # validation_file=args.validation_file,  # validation file_id
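
For context, a minimal sketch of how the new default resolution behaves now that the flag defaults to None. The model names and the 144/32 values come from the diff above; the small namespace used here is purely illustrative and not part of the change:

import argparse

# Hypothetical stand-in for the parsed CLI args (illustrative only).
args = argparse.Namespace(model="togethercomputer/llama-2-70b", batch_size=None)

# Mirrors the default-selection logic added to _run_create above.
if args.batch_size is None:
    if args.model in [
        "togethercomputer/llama-2-70b",
        "togethercomputer/llama-2-70b-chat",
    ]:
        args.batch_size = 144  # fixed batch size required for the 70B models
    else:
        args.batch_size = 32  # previous default for all other models

print(args.batch_size)  # -> 144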

src/together/config.py

Lines changed: 4 additions & 4 deletions
@@ -14,8 +14,8 @@
     "togethercomputer/RedPajama-INCITE-Instruct-3B-v1",
     "togethercomputer/Pythia-Chat-Base-7B",
     "togethercomputer/Llama-2-7B-32K-Instruct",
-    # "togethercomputer/llama-2-70b",
-    # "togethercomputer/llama-2-70b-chat",
+    "togethercomputer/llama-2-70b",
+    "togethercomputer/llama-2-70b-chat",
 ]
 
 # List of models we support and their particular behavior, ie special tokens,
@@ -73,8 +73,8 @@
     "togethercomputer/falcon-7b": {},
     "togethercomputer/llama-2-13b-chat": {"bos_token": "<s>", "eos_token": "</s>"},
     "togethercomputer/llama-2-13b": {"bos_token": "<s>", "eos_token": "</s>"},
-    # "togethercomputer/llama-2-70b-chat": {"bos_token": "<s>", "eos_token": "</s>"},
-    # "togethercomputer/llama-2-70b": {"bos_token": "<s>", "eos_token": "</s>"},
+    "togethercomputer/llama-2-70b-chat": {"bos_token": "<s>", "eos_token": "</s>"},
+    "togethercomputer/llama-2-70b": {"bos_token": "<s>", "eos_token": "</s>"},
     "togethercomputer/llama-2-7b-chat": {"bos_token": "<s>", "eos_token": "</s>"},
     "togethercomputer/llama-2-7b": {"bos_token": "<s>", "eos_token": "</s>"},
     "togethercomputer/mpt-30b-chat": {},

src/together/finetune.py

Lines changed: 20 additions & 2 deletions
@@ -34,8 +34,8 @@ def model_param_count(name: str) -> int:
         "togethercomputer/CodeLlama-13b": 13016028160,
         "togethercomputer/CodeLlama-13b-Python": 13016028160,
         "togethercomputer/CodeLlama-13b-Instruct": 13016028160,
-        # "togethercomputer/llama-2-70b": 68976648192,
-        # "togethercomputer/llama-2-70b-chat": 68976648192,
+        "togethercomputer/llama-2-70b": 68976648192,
+        "togethercomputer/llama-2-70b-chat": 68976648192,
     }
     try:
         return pcount[name]
@@ -89,6 +89,24 @@ def create(
                 f"The number of checkpoints must be < the number of epochs, setting to {n_checkpoints}"
             )
 
+        if (
+            model
+            in ["togethercomputer/llama-2-70b", "togethercomputer/llama-2-70b-chat"]
+            and batch_size != 144
+        ):
+            raise ValueError(
+                f"Batch size must be 144 for {model} model. Please set batch size to 144"
+            )
+
+        # TODO: REMOVE THIS CHECK WHEN WE HAVE CHECKPOINTING WORKING FOR 70B models
+        if n_checkpoints > 1 and model in [
+            "togethercomputer/llama-2-70b",
+            "togethercomputer/llama-2-70b-chat",
+        ]:
+            raise ValueError(
+                "Saving checkpoints during training currently not supported for {model}. Please set the number of checkpoints to 1"
+            )
+
         parameter_payload = {
             "training_file": training_file,
             # "validation_file": validation_file,

0 commit comments
