Skip to content

Commit

Permalink
Add tuning system (#36)
Browse files Browse the repository at this point in the history
Now only for completions
  • Loading branch information
vhaldemar authored Dec 6, 2024
1 parent 0c1ef1f commit 1eadd9d
Show file tree
Hide file tree
Showing 52 changed files with 2,289 additions and 84 deletions.
77 changes: 77 additions & 0 deletions examples/async/tuning/attach.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python3

from __future__ import annotations

import asyncio
import pathlib
import uuid

from yandex_cloud_ml_sdk import AsyncYCloudML


def local_path(path: str) -> pathlib.Path:
    """Return *path* joined onto the directory that contains this script."""
    script_dir = pathlib.Path(__file__).parent
    return script_dir / path


async def get_datasets(sdk):
    """
    Return a ``(train, validation)`` pair of datasets for the example.

    The first dataset listed with status ``READY`` is reused. When the
    listing yields nothing, a new completions dataset is uploaded from the
    local ``example_dataset`` path. The same dataset object is returned for
    both positions of the pair.

    This helper only keeps the example self-contained. In real code you
    could simply use dataset ids, for example:
    ```
    dataset = await sdk.datasets.get("some_id")
    tuning_task = await base_model.tune_deferred(
        "dataset_id",
        validation_datasets=dataset
    )
    ```
    """
    reused = False
    async for dataset in sdk.datasets.list(status="READY"):
        # Only the first READY dataset is needed.
        reused = True
        break

    if reused:
        print(f'using old dataset {dataset=}')
    else:
        print('no old datasets found, creating new one')
        draft = sdk.datasets.completions.from_path_deferred(
            path=local_path('example_dataset'),
            upload_format='jsonlines',
            name='foo',
        )

        # The upload yields an operation; awaiting it yields the dataset.
        upload_operation = await draft.upload()
        dataset = await upload_operation
        print(f'created new dataset {dataset=}')

    return dataset, dataset


async def main() -> None:
    """
    Start a tuning task, re-attach to it by id and fetch it via ``sdk.tuning.get``.

    The task is canceled at the end, whether or not the steps in between
    succeed.
    """
    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
    train_dataset, validation_dataset = await get_datasets(sdk)
    base_model = sdk.models.completions('yandexgpt-lite')

    # A random UUID is used as the task name.
    task_name = str(uuid.uuid4())
    tuning_task = await base_model.tune_deferred(
        train_dataset,
        validation_datasets=validation_dataset,
        name=task_name,
    )
    print(f'new {tuning_task=}')

    try:
        task_id = tuning_task.id

        same_task = await base_model.attach_tune_deferred(task_id)
        print(f'{same_task=}')

        # IMPORTANT
        # `.get` raises NOT_FOUND during the first few seconds, until the
        # Yandex Cloud "Operation" has created the "TuningTask" at the backend.
        backend_delay_seconds = 5
        await asyncio.sleep(backend_delay_seconds)

        same_task2 = await sdk.tuning.get(task_id)
        print(f'{same_task2=}')
    finally:
        await tuning_task.cancel()


# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
78 changes: 78 additions & 0 deletions examples/async/tuning/basic_train_and_use.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3

from __future__ import annotations

import asyncio
import pathlib
import uuid

from yandex_cloud_ml_sdk import AsyncYCloudML


def local_path(path: str) -> pathlib.Path:
    """Build a path to *path* inside the directory holding this script."""
    return pathlib.Path(__file__).parent.joinpath(path)


async def get_datasets(sdk):
    """
    Return a ``(train, validation)`` pair of datasets for the example.

    The first dataset listed with status ``READY`` is reused. When the
    listing yields nothing, a new completions dataset is uploaded from the
    local ``example_dataset`` path. The same dataset object is returned for
    both positions of the pair.

    This helper only keeps the example self-contained. In real code you
    could simply use dataset ids, for example:
    ```
    dataset = await sdk.datasets.get("some_id")
    tuning_task = await base_model.tune_deferred(
        "dataset_id",
        validation_datasets=dataset
    )
    ```
    """
    reused = False
    async for dataset in sdk.datasets.list(status="READY"):
        # Only the first READY dataset is needed.
        reused = True
        break

    if reused:
        print(f'using old dataset {dataset=}')
    else:
        print('no old datasets found, creating new one')
        draft = sdk.datasets.completions.from_path_deferred(
            path=local_path('example_dataset'),
            upload_format='jsonlines',
            name='foo',
        )

        # The upload yields an operation; awaiting it yields the dataset.
        upload_operation = await draft.upload()
        dataset = await upload_operation
        print(f'created new dataset {dataset=}')

    return dataset, dataset


async def main() -> None:
    """
    Tune a base model, run the tuned model, then rebuild it from its URI
    and run it again.
    """
    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
    train_dataset, validation_dataset = await get_datasets(sdk)
    base_model = sdk.models.completions('yandexgpt-lite')

    # `.tune(...)` is a shortcut for:
    #   tuning_task = await base_model.tune_deferred(...)
    #   new_model = await tuning_task.wait(...)
    # The shortcut gives you less control over canceling the tuning
    # and over reporting.
    tune_options = {
        'validation_datasets': validation_dataset,
        'name': str(uuid.uuid4()),
    }
    new_model = await base_model.tune(train_dataset, **tune_options)
    print(f'resulting {new_model}')

    prompt = "hey!"
    completion_result = await new_model.run(prompt)
    print(f'{completion_result=}')

    # Alternatively, save `new_model.uri` somewhere and reuse it later
    # to get the tuned model back.
    model = sdk.models.completions(new_model.uri)

    completion_result = await model.run(prompt)
    print(f'{completion_result=}')


# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
81 changes: 81 additions & 0 deletions examples/async/tuning/cancel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

from __future__ import annotations

import asyncio
import pathlib
import uuid

from yandex_cloud_ml_sdk import AsyncYCloudML


def local_path(path: str) -> pathlib.Path:
    """Return *path* joined onto the directory that contains this script."""
    script_dir = pathlib.Path(__file__).parent
    return script_dir / path


async def get_datasets(sdk):
    """
    Return a ``(train, validation)`` pair of datasets for the example.

    The first dataset listed with status ``READY`` is reused. When the
    listing yields nothing, a new completions dataset is uploaded from the
    local ``example_dataset`` path. The same dataset object is returned for
    both positions of the pair.

    This helper only keeps the example self-contained. In real code you
    could simply use dataset ids, for example:
    ```
    dataset = await sdk.datasets.get("some_id")
    tuning_task = await base_model.tune_deferred(
        "dataset_id",
        validation_datasets=dataset
    )
    ```
    """
    reused = False
    async for dataset in sdk.datasets.list(status="READY"):
        # Only the first READY dataset is needed.
        reused = True
        break

    if reused:
        print(f'using old dataset {dataset=}')
    else:
        print('no old datasets found, creating new one')
        draft = sdk.datasets.completions.from_path_deferred(
            path=local_path('example_dataset'),
            upload_format='jsonlines',
            name='foo',
        )

        # The upload yields an operation; awaiting it yields the dataset.
        upload_operation = await draft.upload()
        dataset = await upload_operation
        print(f'created new dataset {dataset=}')

    return dataset, dataset


async def main() -> None:
    """
    Start a tuning task, poll its status and task info a few times, then
    cancel it and print the status and task info once more.
    """
    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
    train_dataset, validation_dataset = await get_datasets(sdk)
    base_model = sdk.models.completions('yandexgpt-lite')

    # A random UUID is used as the task name.
    task_name = str(uuid.uuid4())
    tuning_task = await base_model.tune_deferred(
        train_dataset,
        validation_datasets=validation_dataset,
        name=task_name,
    )
    print(f'new {tuning_task=}')

    try:
        polls_left = 3
        while polls_left > 0:
            status = await tuning_task.get_status()
            print(f'{status=}')

            task_info = await tuning_task.get_task_info()
            print(f'{task_info=}')

            await asyncio.sleep(5)
            polls_left -= 1
    finally:
        # Cancel in every case, then show what the task looks like afterwards.
        await tuning_task.cancel()

        status = await tuning_task.get_status()
        print(f'{status=} after cancel')

        task_info = await tuning_task.get_task_info()
        print(f'{task_info=} after cancel')


# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
74 changes: 74 additions & 0 deletions examples/async/tuning/list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3

from __future__ import annotations

import asyncio
import pathlib
import uuid

from yandex_cloud_ml_sdk import AsyncYCloudML


def local_path(path: str) -> pathlib.Path:
    """Build a path to *path* inside the directory holding this script."""
    return pathlib.Path(__file__).parent.joinpath(path)


async def get_datasets(sdk):
    """
    Return a ``(train, validation)`` pair of datasets for the example.

    The first dataset listed with status ``READY`` is reused. When the
    listing yields nothing, a new completions dataset is uploaded from the
    local ``example_dataset`` path. The same dataset object is returned for
    both positions of the pair.

    This helper only keeps the example self-contained. In real code you
    could simply use dataset ids, for example:
    ```
    dataset = await sdk.datasets.get("some_id")
    tuning_task = await base_model.tune_deferred(
        "dataset_id",
        validation_datasets=dataset
    )
    ```
    """
    reused = False
    async for dataset in sdk.datasets.list(status="READY"):
        # Only the first READY dataset is needed.
        reused = True
        break

    if reused:
        print(f'using old dataset {dataset=}')
    else:
        print('no old datasets found, creating new one')
        draft = sdk.datasets.completions.from_path_deferred(
            path=local_path('example_dataset'),
            upload_format='jsonlines',
            name='foo',
        )

        # The upload yields an operation; awaiting it yields the dataset.
        upload_operation = await draft.upload()
        dataset = await upload_operation
        print(f'created new dataset {dataset=}')

    return dataset, dataset


async def main() -> None:
    """
    Start tuning task(s), find them via ``sdk.tuning.list()`` and cancel them.

    Only tasks started by this run are canceled. Any other task returned by
    the listing is reported and left untouched, so running the example does
    not cancel unrelated tuning tasks.
    """
    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
    train_dataset, validation_dataset = await get_datasets(sdk)
    base_model = sdk.models.completions('yandexgpt-lite')

    # Handles of the tasks started by this run, keyed by task id.
    started_tasks = {}
    try:
        for _ in range(1):
            new_task = await base_model.tune_deferred(
                train_dataset,
                validation_datasets=validation_dataset,
                name=str(uuid.uuid4()),
            )
            started_tasks[new_task.id] = new_task

        # NB: there is a time gap before freshly started tuning tasks
        # become available at the backend as `TuningTask` objects.
        await asyncio.sleep(5)

        async for tuning_task in sdk.tuning.list():
            # NOTE(review): assumes a listed task exposes the same `.id` as
            # the handle returned by `tune_deferred` -- confirm with the SDK.
            if tuning_task.id not in started_tasks:
                print(f'found task {tuning_task=}, not started by this example, skipping')
                continue

            # or you could wait for tasks, instead of canceling
            print(f'found task {tuning_task=}, canceling')
            await tuning_task.cancel()
            del started_tasks[tuning_task.id]
    finally:
        # Cancel through the original handles whatever the listing did not
        # cover (task not visible yet, ids not matching, or an error above),
        # so the example never leaves its own tasks running.
        for leftover_task in started_tasks.values():
            await leftover_task.cancel()


# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    asyncio.run(main())
Loading

0 comments on commit 1eadd9d

Please sign in to comment.