
Commit 5151fd5

New options for preference tuning: rpo alpha, logprobs normalization, reference-free, simpo gamma (#327)
* Add dpo improvements arguments
* Version bump (tmp, dev)
* Implicit setting of `reference_free` in case if simpo_gamma is set
* Fix unbound variable
* Fix
* Force normalization for simpo
* Version bump
* Formatting
* Version fix
* Remove reference-free from dpo
* Review fixes
* Formatting
* Fixes
1 parent ecd68a4 commit 5151fd5

File tree (4 files changed: +89 −2 lines)

* pyproject.toml
* src/together/cli/api/finetune.py
* src/together/resources/finetune.py
* src/together/types/finetune.py

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
 
 [tool.poetry]
 name = "together"
-version = "1.5.13"
+version = "1.5.14"
 authors = ["Together AI <[email protected]>"]
 description = "Python client for Together's Cloud Platform!"
 readme = "README.md"

src/together/cli/api/finetune.py

Lines changed: 30 additions & 0 deletions
@@ -142,6 +142,30 @@ def fine_tuning(ctx: click.Context) -> None:
     default=0.1,
     help="Beta parameter for DPO training (only used when '--training-method' is 'dpo')",
 )
+@click.option(
+    "--dpo-normalize-logratios-by-length",
+    type=bool,
+    default=False,
+    help=(
+        "Whether to normalize logratios by sample length "
+        "(only used when '--training-method' is 'dpo')"
+    ),
+)
+@click.option(
+    "--rpo-alpha",
+    type=float,
+    default=0.0,
+    help=(
+        "RPO alpha parameter of DPO training to include NLL in the loss "
+        "(only used when '--training-method' is 'dpo')"
+    ),
+)
+@click.option(
+    "--simpo-gamma",
+    type=float,
+    default=0.1,
+    help="SimPO gamma parameter (only used when '--training-method' is 'dpo')",
+)
 @click.option(
     "--suffix",
     "-s",
@@ -206,6 +230,9 @@ def create(
     train_on_inputs: bool | Literal["auto"],
     training_method: str,
     dpo_beta: float,
+    dpo_normalize_logratios_by_length: bool,
+    rpo_alpha: float,
+    simpo_gamma: float,
     from_checkpoint: str,
 ) -> None:
     """Start fine-tuning"""
@@ -239,6 +266,9 @@ def create(
         train_on_inputs=train_on_inputs,
         training_method=training_method,
         dpo_beta=dpo_beta,
+        dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+        rpo_alpha=rpo_alpha,
+        simpo_gamma=simpo_gamma,
         from_checkpoint=from_checkpoint,
     )
 
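With these options in place, a SimPO-style preference-tuning run could be launched from the CLI roughly as follows (a sketch: the file ID and model name are placeholders, while the flags are the ones defined above):

    together fine-tuning create \
      --training-file file-abc123 \
      --model meta-llama/Meta-Llama-3.1-8B-Instruct-Reference \
      --training-method dpo \
      --dpo-beta 0.1 \
      --rpo-alpha 0.1 \
      --simpo-gamma 0.5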

src/together/resources/finetune.py

Lines changed: 54 additions & 1 deletion
@@ -72,6 +72,9 @@ def create_finetune_request(
     train_on_inputs: bool | Literal["auto"] | None = None,
     training_method: str = "sft",
     dpo_beta: float | None = None,
+    dpo_normalize_logratios_by_length: bool = False,
+    rpo_alpha: float | None = None,
+    simpo_gamma: float | None = None,
     from_checkpoint: str | None = None,
 ) -> FinetuneRequest:
     if model is not None and from_checkpoint is not None:
@@ -182,6 +185,21 @@ def create_finetune_request(
 
     if dpo_beta is not None and training_method != "dpo":
         raise ValueError("dpo_beta is only supported for DPO training")
+    if dpo_normalize_logratios_by_length and training_method != "dpo":
+        raise ValueError(
+            "dpo_normalize_logratios_by_length=True is only supported for DPO training"
+        )
+    if rpo_alpha is not None:
+        if training_method != "dpo":
+            raise ValueError("rpo_alpha is only supported for DPO training")
+        if not rpo_alpha >= 0.0:
+            raise ValueError(f"rpo_alpha should be non-negative (got {rpo_alpha})")
+
+    if simpo_gamma is not None:
+        if training_method != "dpo":
+            raise ValueError("simpo_gamma is only supported for DPO training")
+        if not simpo_gamma >= 0.0:
+            raise ValueError(f"simpo_gamma should be non-negative (got {simpo_gamma})")
 
     lr_scheduler: FinetuneLRScheduler
     if lr_scheduler_type == "cosine":
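These guards make the new knobs DPO-only; a hedged sketch of how a bad combination surfaces through the client (placeholder file ID and model name, API key assumed to be configured):

    from together import Together

    client = Together()

    # create_finetune_request() above rejects DPO-only options for SFT jobs:
    # ValueError("rpo_alpha is only supported for DPO training")
    client.fine_tuning.create(
        training_file="file-abc123",  # placeholder file ID
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model
        training_method="sft",
        rpo_alpha=0.1,
    )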
@@ -204,7 +222,24 @@ def create_finetune_request(
     if training_method == "sft":
         training_method_cls = TrainingMethodSFT(train_on_inputs=train_on_inputs)
     elif training_method == "dpo":
-        training_method_cls = TrainingMethodDPO(dpo_beta=dpo_beta)
+        if simpo_gamma is not None and simpo_gamma > 0:
+            dpo_reference_free = True
+            dpo_normalize_logratios_by_length = True
+            rprint(
+                f"Parameter simpo_gamma was set to {simpo_gamma}. "
+                "SimPO training detected. Reference logits will not be used "
+                "and length normalization of log-probabilities will be enabled."
+            )
+        else:
+            dpo_reference_free = False
+
+        training_method_cls = TrainingMethodDPO(
+            dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            dpo_reference_free=dpo_reference_free,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
+        )
 
     finetune_request = FinetuneRequest(
         model=model,
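The net effect: as soon as simpo_gamma is positive, the job is built reference-free with length-normalized log-probabilities, whatever was passed for dpo_normalize_logratios_by_length. A minimal sketch of the equivalent training-method object, using the TrainingMethodDPO model this commit extends in together.types.finetune (illustrative values):

    from together.types.finetune import TrainingMethodDPO

    # What the simpo_gamma > 0 branch above ends up constructing:
    training_method_cls = TrainingMethodDPO(
        dpo_beta=0.1,
        dpo_normalize_logratios_by_length=True,  # forced on when simpo_gamma > 0
        dpo_reference_free=True,                 # forced on when simpo_gamma > 0
        rpo_alpha=None,
        simpo_gamma=0.5,
    )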
@@ -302,6 +337,9 @@ def create(
         train_on_inputs: bool | Literal["auto"] | None = None,
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        dpo_normalize_logratios_by_length: bool = False,
+        rpo_alpha: float | None = None,
+        simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -353,6 +391,9 @@ def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            dpo_normalize_logratios_by_length (bool): Whether or not to normalize logratios by sample length. Defaults to False.
+            rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
+            simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -405,6 +446,9 @@ def create(
             train_on_inputs=train_on_inputs,
             training_method=training_method,
             dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
             from_checkpoint=from_checkpoint,
         )
 
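Putting the new arguments together, a DPO job with the RPO and SimPO extensions could be submitted through the Python client roughly like this (a sketch: file ID and model name are placeholders, TOGETHER_API_KEY is assumed to be set):

    from together import Together

    client = Together()

    job = client.fine_tuning.create(
        training_file="file-abc123",  # placeholder: an uploaded preference dataset
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model
        training_method="dpo",
        dpo_beta=0.1,
        rpo_alpha=0.1,    # adds an NLL term to the DPO loss
        simpo_gamma=0.5,  # > 0 switches the job to reference-free, length-normalized SimPO
    )
    print(job.id)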

@@ -714,6 +758,9 @@ async def create(
         train_on_inputs: bool | Literal["auto"] | None = None,
         training_method: str = "sft",
         dpo_beta: float | None = None,
+        dpo_normalize_logratios_by_length: bool = False,
+        rpo_alpha: float | None = None,
+        simpo_gamma: float | None = None,
         from_checkpoint: str | None = None,
     ) -> FinetuneResponse:
         """
@@ -765,6 +812,9 @@ async def create(
             training_method (str, optional): Training method. Defaults to "sft".
                 Supported methods: "sft", "dpo".
             dpo_beta (float, optional): DPO beta parameter. Defaults to None.
+            dpo_normalize_logratios_by_length (bool): Whether or not to normalize logratios by sample length. Defaults to False.
+            rpo_alpha (float, optional): RPO alpha parameter of DPO training to include NLL in the loss. Defaults to None.
+            simpo_gamma (float, optional): SimPO gamma parameter. Defaults to None.
             from_checkpoint (str, optional): The checkpoint identifier to continue training from a previous fine-tuning job.
                 The format: {$JOB_ID/$OUTPUT_MODEL_NAME}:{$STEP}.
                 The step value is optional, without it the final checkpoint will be used.
@@ -817,6 +867,9 @@ async def create(
             train_on_inputs=train_on_inputs,
             training_method=training_method,
             dpo_beta=dpo_beta,
+            dpo_normalize_logratios_by_length=dpo_normalize_logratios_by_length,
+            rpo_alpha=rpo_alpha,
+            simpo_gamma=simpo_gamma,
             from_checkpoint=from_checkpoint,
         )
 
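The async client mirrors the same signature; a short sketch under the same placeholder assumptions:

    import asyncio

    from together import AsyncTogether


    async def main() -> None:
        client = AsyncTogether()
        job = await client.fine_tuning.create(
            training_file="file-abc123",  # placeholder file ID
            model="meta-llama/Meta-Llama-3.1-8B-Instruct-Reference",  # placeholder model
            training_method="dpo",
            dpo_beta=0.1,
            rpo_alpha=0.1,
            simpo_gamma=0.5,
        )
        print(job.id)


    asyncio.run(main())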

src/together/types/finetune.py

Lines changed: 4 additions & 0 deletions
@@ -159,6 +159,10 @@ class TrainingMethodDPO(TrainingMethod):
 
     method: Literal["dpo"] = "dpo"
     dpo_beta: float | None = None
+    dpo_normalize_logratios_by_length: bool = False
+    dpo_reference_free: bool = False
+    rpo_alpha: float | None = None
+    simpo_gamma: float | None = None
 
 
 class FinetuneRequest(BaseModel):
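The defaults keep plain DPO unchanged: a request that only sets dpo_beta gets no length normalization, no reference-free mode, and no RPO or SimPO terms. A small sketch (import path taken from this file):

    from together.types.finetune import TrainingMethodDPO

    plain_dpo = TrainingMethodDPO(dpo_beta=0.1)
    assert plain_dpo.method == "dpo"
    assert plain_dpo.dpo_normalize_logratios_by_length is False
    assert plain_dpo.dpo_reference_free is False
    assert plain_dpo.rpo_alpha is None and plain_dpo.simpo_gamma is None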

0 commit comments
