
Commit 84ddd14

Authored by azahed98 (Arsh Zahed), with Max Ryabinin (mryab)

ENG 1730: Support validation files for fine-tuning jobs (#161)

* Add n_evals and validation_file
* Add eval complete, validation file arg
* Add eval complete, validation file arg
* Change to "File ID" (Co-authored-by: Max Ryabinin <[email protected]>)
* Change to "File ID" 2 (Co-authored-by: Max Ryabinin <[email protected]>)
* Change to log warn
* Update pyproject version
* Undo lock commit

Co-authored-by: Arsh Zahed <[email protected]>
Co-authored-by: Max Ryabinin <[email protected]>

1 parent 8c1073d · commit 84ddd14

File tree: 3 files changed (+41, -0)

src/together/cli/api/finetune.py (16 additions & 0 deletions)

@@ -23,6 +23,10 @@ def fine_tuning(ctx: click.Context) -> None:
 )
 @click.option("--model", type=str, required=True, help="Base model name")
 @click.option("--n-epochs", type=int, default=1, help="Number of epochs to train for")
+@click.option(
+    "--validation-file", type=str, default="", help="Validation file ID from Files API"
+)
+@click.option("--n-evals", type=int, default=0, help="Number of evaluation loops")
 @click.option(
     "--n-checkpoints", type=int, default=1, help="Number of checkpoints to save"
 )
@@ -50,8 +54,10 @@ def fine_tuning(ctx: click.Context) -> None:
 def create(
     ctx: click.Context,
     training_file: str,
+    validation_file: str,
     model: str,
     n_epochs: int,
+    n_evals: int,
     n_checkpoints: int,
     batch_size: int,
     learning_rate: float,
@@ -80,11 +86,21 @@ def create(
                 f"You set LoRA parameter `{param}` for a full fine-tuning job. "
                 f"Please change the job type with --lora or remove `{param}` from the arguments"
             )
+    if n_evals <= 0 and validation_file:
+        log_warn(
+            "Warning: You have specified a validation file but the number of evaluation loops is set to 0. No evaluations will be performed."
+        )
+    elif n_evals > 0 and not validation_file:
+        raise click.BadParameter(
+            "You have specified a number of evaluation loops but no validation file."
+        )
 
     response = client.fine_tuning.create(
         training_file=training_file,
         model=model,
         n_epochs=n_epochs,
+        validation_file=validation_file,
+        n_evals=n_evals,
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
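
The two new options are mutually dependent: `--n-evals` greater than zero without `--validation-file` aborts with a usage error, while a validation file with `--n-evals` left at 0 only logs a warning. The CLI forwards both values straight into the client call in the last hunk; a minimal sketch of the equivalent SDK call follows (the file IDs and model name are placeholders, and constructing Together() assumes TOGETHER_API_KEY is set in the environment):

from together import Together

client = Together()  # picks up TOGETHER_API_KEY from the environment

response = client.fine_tuning.create(
    training_file="file-train-0000",      # placeholder File ID from the Files API
    validation_file="file-valid-0000",    # placeholder File ID from the Files API
    model="togethercomputer/llama-2-7b",  # placeholder base model name
    n_epochs=1,
    n_evals=2,  # must be > 0 whenever a validation file is supplied via the CLI
)
print(response.id)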

src/together/resources/finetune.py (12 additions & 0 deletions)

@@ -30,6 +30,8 @@ def create(
     training_file: str,
     model: str,
     n_epochs: int = 1,
+    validation_file: str | None = "",
+    n_evals: int | None = 0,
     n_checkpoints: int | None = 1,
     batch_size: int | None = 16,
     learning_rate: float | None = 0.00001,
@@ -48,6 +50,8 @@ def create(
         training_file (str): File-ID of a file uploaded to the Together API
         model (str): Name of the base model to run fine-tune job on
         n_epochs (int, optional): Number of epochs for fine-tuning. Defaults to 1.
+        validation_file (str, optional): File ID of a file uploaded to the Together API for validation.
+        n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
         n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
             Defaults to 1.
         batch_size (int, optional): Batch size for fine-tuning. Defaults to 32.
@@ -83,7 +87,9 @@ def create(
     parameter_payload = FinetuneRequest(
         model=model,
         training_file=training_file,
+        validation_file=validation_file,
         n_epochs=n_epochs,
+        n_evals=n_evals,
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
@@ -275,6 +281,8 @@ async def create(
     training_file: str,
     model: str,
     n_epochs: int = 1,
+    validation_file: str | None = "",
+    n_evals: int = 0,
     n_checkpoints: int | None = 1,
     batch_size: int | None = 32,
     learning_rate: float = 0.00001,
@@ -288,6 +296,8 @@ async def create(
         training_file (str): File-ID of a file uploaded to the Together API
         model (str): Name of the base model to run fine-tune job on
         n_epochs (int, optional): Number of epochs for fine-tuning. Defaults to 1.
+        validation_file (str, optional): File ID of a file uploaded to the Together API for validation.
+        n_evals (int, optional): Number of evaluation loops to run. Defaults to 0.
         n_checkpoints (int, optional): Number of checkpoints to save during fine-tuning.
             Defaults to 1.
         batch_size (int, optional): Batch size for fine-tuning. Defaults to 32.
@@ -309,7 +319,9 @@ async def create(
     parameter_payload = FinetuneRequest(
         model=model,
         training_file=training_file,
+        validation_file=validation_file,
         n_epochs=n_epochs,
+        n_evals=n_evals,
         n_checkpoints=n_checkpoints,
         batch_size=batch_size,
         learning_rate=learning_rate,
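
The same two keyword arguments are added to both the sync and the async create. A hedged async sketch (assumes the SDK's AsyncTogether client, and reuses the placeholder IDs from the CLI example above):

import asyncio

from together import AsyncTogether


async def main() -> None:
    client = AsyncTogether()  # picks up TOGETHER_API_KEY from the environment
    job = await client.fine_tuning.create(
        training_file="file-train-0000",      # placeholder File ID
        validation_file="file-valid-0000",    # placeholder File ID
        model="togethercomputer/llama-2-7b",  # placeholder base model name
        n_evals=2,
    )
    print(job.id, job.status)


asyncio.run(main())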

src/together/types/finetune.py (13 additions & 0 deletions)

@@ -61,6 +61,7 @@ class FinetuneEventType(str, Enum):
     CHECKPOINT_SAVE = "CHECKPOINT_SAVE"
     BILLING_LIMIT = "BILLING_LIMIT"
     EPOCH_COMPLETE = "EPOCH_COMPLETE"
+    EVAL_COMPLETE = "EVAL_COMPLETE"
     TRAINING_COMPLETE = "TRAINING_COMPLETE"
     MODEL_COMPRESSING = "COMPRESSING_MODEL"
     MODEL_COMPRESSION_COMPLETE = "MODEL_COMPRESSION_COMPLETE"
@@ -135,6 +136,8 @@ class FinetuneRequest(BaseModel):
 
     # training file ID
     training_file: str
+    # validation file ID
+    validation_file: str | None = None
     # base model string
     model: str
     # number of epochs to train for
@@ -143,6 +146,8 @@ class FinetuneRequest(BaseModel):
     learning_rate: float
     # number of checkpoints to save
     n_checkpoints: int | None = None
+    # number of evaluation loops to run
+    n_evals: int | None = None
     # training batch size
     batch_size: int | None = None
     # up to 40 character suffix for output model name
@@ -173,6 +178,8 @@ class FinetuneResponse(BaseModel):
     n_epochs: int | None = None
     # number of checkpoints to save
     n_checkpoints: int | None = None
+    # number of evaluation loops
+    n_evals: int | None = None
     # training batch size
     batch_size: int | None = None
     # training learning rate
@@ -196,8 +203,14 @@ class FinetuneResponse(BaseModel):
     param_count: int | None = None
     # fine-tune job price
     total_price: int | None = None
+    # total number of training steps
+    total_steps: int | None = None
+    # number of steps completed (incrementing counter)
+    steps_completed: int | None = None
     # number of epochs completed (incrementing counter)
     epochs_completed: int | None = None
+    # number of evaluation loops completed (incrementing counter)
+    evals_completed: int | None = None
     # place in job queue (decrementing counter)
     queue_depth: int | None = None
     # weights & biases project name
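
The new response fields make evaluation progress observable alongside the step and epoch counters, and EVAL_COMPLETE marks each finished evaluation loop in the event log. A sketch of reading them back (assumes an existing job ID; retrieve and list_events are the SDK's read endpoints, but treat the exact field access as illustrative since every field is Optional):

from together import Together
from together.types.finetune import FinetuneEventType

client = Together()
job = client.fine_tuning.retrieve("ft-0000")  # placeholder job ID

if job.n_evals and job.evals_completed is not None:
    print(f"evaluation loops: {job.evals_completed}/{job.n_evals}")
if job.total_steps and job.steps_completed is not None:
    print(f"steps: {job.steps_completed}/{job.total_steps}")

# each completed evaluation loop should also appear as an EVAL_COMPLETE event
events = client.fine_tuning.list_events("ft-0000")
evals = [e for e in events.data or [] if e.type == FinetuneEventType.EVAL_COMPLETE]
print(f"{len(evals)} EVAL_COMPLETE event(s) logged")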
