Add tuning system (#36)

Now only for completions
yandex-cloud · Dec 6, 2024 · 1eadd9d · 1eadd9d
1 parent 0c1ef1f
commit 1eadd9d
Show file tree

Hide file tree

Showing 52 changed files with 2,289 additions and 84 deletions.
diff --git a/examples/async/tuning/attach.py b/examples/async/tuning/attach.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import asyncio
+import pathlib
+import uuid
+
+from yandex_cloud_ml_sdk import AsyncYCloudML
+
+
+def local_path(path: str) -> pathlib.Path:
+    """Return ``path`` resolved against the directory holding this script."""
+    script_dir = pathlib.Path(__file__).parent
+    return script_dir / path
+
+
+async def get_datasets(sdk):
+    """
+    Return a ``(train, validation)`` pair of datasets for the example.
+
+    The first dataset listed with status ``READY`` is reused when there is
+    one; otherwise the local ``example_dataset`` path is uploaded as a new
+    completions dataset. The same dataset object fills both slots of the
+    returned pair.
+
+    Real code usually already knows its dataset ids and can skip this
+    helper entirely, for example:
+
+    ```
+    dataset = await sdk.datasets.get("some_id")
+    tuning_task = await base_model.tune_deferred(
+        "dataset_id",
+        validation_datasets=dataset
+    )
+    ```
+    """
+
+    async for dataset in sdk.datasets.list(status="READY"):
+        # Any ready dataset will do: stop at the first one.
+        print(f'using old dataset {dataset=}')
+        return dataset, dataset
+
+    # The listing was exhausted without yielding anything.
+    print('no old datasets found, creating new one')
+    draft = sdk.datasets.completions.from_path_deferred(
+        path=local_path('example_dataset'),
+        upload_format='jsonlines',
+        name='foo',
+    )
+
+    # upload() gives back an awaitable operation that resolves to the dataset
+    upload_operation = await draft.upload()
+    dataset = await upload_operation
+    print(f'created new dataset {dataset=}')
+
+    return dataset, dataset
+
+
+async def main() -> None:
+    """
+    Start a tuning task, look it up again by its id, then cancel it.
+
+    The lookup is shown twice: via ``base_model.attach_tune_deferred`` and
+    via ``sdk.tuning.get``. The task is cancelled in a ``finally`` block, so
+    it is cancelled even when one of the lookups fails.
+    """
+    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
+    datasets = await get_datasets(sdk)
+    train_dataset, validation_dataset = datasets
+    base_model = sdk.models.completions('yandexgpt-lite')
+
+    # every run gets a fresh random task name
+    task_name = str(uuid.uuid4())
+    tuning_task = await base_model.tune_deferred(
+        train_dataset,
+        validation_datasets=validation_dataset,
+        name=task_name,
+    )
+    print(f'new {tuning_task=}')
+
+    try:
+        same_task = await base_model.attach_tune_deferred(tuning_task.id)
+        print(f'{same_task=}')
+
+        # IMPORTANT
+        # For the first few seconds `.get` raises NOT_FOUND: the Yandex Cloud
+        # "Operation" has not yet created the "TuningTask" at the backend.
+        startup_delay = 5
+        await asyncio.sleep(startup_delay)
+
+        same_task2 = await sdk.tuning.get(tuning_task.id)
+        print(f'{same_task2=}')
+    finally:
+        await tuning_task.cancel()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/examples/async/tuning/basic_train_and_use.py b/examples/async/tuning/basic_train_and_use.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import asyncio
+import pathlib
+import uuid
+
+from yandex_cloud_ml_sdk import AsyncYCloudML
+
+
+def local_path(path: str) -> pathlib.Path:
+    """Return ``path`` resolved against the directory holding this script."""
+    script_dir = pathlib.Path(__file__).parent
+    return script_dir / path
+
+
+async def get_datasets(sdk):
+    """
+    Return a ``(train, validation)`` pair of datasets for the example.
+
+    The first dataset listed with status ``READY`` is reused when there is
+    one; otherwise the local ``example_dataset`` path is uploaded as a new
+    completions dataset. The same dataset object fills both slots of the
+    returned pair.
+
+    Real code usually already knows its dataset ids and can skip this
+    helper entirely, for example:
+
+    ```
+    dataset = await sdk.datasets.get("some_id")
+    tuning_task = await base_model.tune_deferred(
+        "dataset_id",
+        validation_datasets=dataset
+    )
+    ```
+    """
+
+    async for dataset in sdk.datasets.list(status="READY"):
+        # Any ready dataset will do: stop at the first one.
+        print(f'using old dataset {dataset=}')
+        return dataset, dataset
+
+    # The listing was exhausted without yielding anything.
+    print('no old datasets found, creating new one')
+    draft = sdk.datasets.completions.from_path_deferred(
+        path=local_path('example_dataset'),
+        upload_format='jsonlines',
+        name='foo',
+    )
+
+    # upload() gives back an awaitable operation that resolves to the dataset
+    upload_operation = await draft.upload()
+    dataset = await upload_operation
+    print(f'created new dataset {dataset=}')
+
+    return dataset, dataset
+
+
+async def main() -> None:
+    """
+    Tune ``yandexgpt-lite``, wait for the result and run the tuned model.
+
+    The tuned model is queried twice: through the object returned by
+    ``.tune(...)`` and through a model object rebuilt from its ``uri``.
+    """
+    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
+    datasets = await get_datasets(sdk)
+    train_dataset, validation_dataset = datasets
+    base_model = sdk.models.completions('yandexgpt-lite')
+
+    # `.tune(...)` is shorthand for
+    #     tuning_task = await base_model.tune_deferred(...)
+    #     new_model = await tuning_task.wait(...)
+    # at the price of less control over cancelling and reporting.
+    task_name = str(uuid.uuid4())
+    tuned_model = await base_model.tune(
+        train_dataset,
+        validation_datasets=validation_dataset,
+        name=task_name,
+    )
+    print(f'resulting {tuned_model}')
+
+    completion_result = await tuned_model.run("hey!")
+    print(f'{completion_result=}')
+
+    # The uri can also be stored somewhere and used later to rebuild the model.
+    restored_model = sdk.models.completions(tuned_model.uri)
+
+    completion_result = await restored_model.run("hey!")
+    print(f'{completion_result=}')
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/examples/async/tuning/cancel.py b/examples/async/tuning/cancel.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import asyncio
+import pathlib
+import uuid
+
+from yandex_cloud_ml_sdk import AsyncYCloudML
+
+
+def local_path(path: str) -> pathlib.Path:
+    """Return ``path`` resolved against the directory holding this script."""
+    script_dir = pathlib.Path(__file__).parent
+    return script_dir / path
+
+
+async def get_datasets(sdk):
+    """
+    Return a ``(train, validation)`` pair of datasets for the example.
+
+    The first dataset listed with status ``READY`` is reused when there is
+    one; otherwise the local ``example_dataset`` path is uploaded as a new
+    completions dataset. The same dataset object fills both slots of the
+    returned pair.
+
+    Real code usually already knows its dataset ids and can skip this
+    helper entirely, for example:
+
+    ```
+    dataset = await sdk.datasets.get("some_id")
+    tuning_task = await base_model.tune_deferred(
+        "dataset_id",
+        validation_datasets=dataset
+    )
+    ```
+    """
+
+    async for dataset in sdk.datasets.list(status="READY"):
+        # Any ready dataset will do: stop at the first one.
+        print(f'using old dataset {dataset=}')
+        return dataset, dataset
+
+    # The listing was exhausted without yielding anything.
+    print('no old datasets found, creating new one')
+    draft = sdk.datasets.completions.from_path_deferred(
+        path=local_path('example_dataset'),
+        upload_format='jsonlines',
+        name='foo',
+    )
+
+    # upload() gives back an awaitable operation that resolves to the dataset
+    upload_operation = await draft.upload()
+    dataset = await upload_operation
+    print(f'created new dataset {dataset=}')
+
+    return dataset, dataset
+
+
+async def main() -> None:
+    """
+    Start a tuning task, poll it a few times, cancel it and report again.
+
+    Status and task info are printed on every poll and once more after the
+    cancel. The cancel sits in a ``finally`` block, so it is issued even
+    when polling fails.
+    """
+    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
+    datasets = await get_datasets(sdk)
+    train_dataset, validation_dataset = datasets
+    base_model = sdk.models.completions('yandexgpt-lite')
+
+    # every run gets a fresh random task name
+    task_name = str(uuid.uuid4())
+    tuning_task = await base_model.tune_deferred(
+        train_dataset,
+        validation_datasets=validation_dataset,
+        name=task_name,
+    )
+    print(f'new {tuning_task=}')
+
+    poll_count = 3
+    poll_interval = 5  # seconds slept after each poll
+
+    try:
+        for _ in range(poll_count):
+            status = await tuning_task.get_status()
+            print(f'{status=}')
+
+            task_info = await tuning_task.get_task_info()
+            print(f'{task_info=}')
+
+            await asyncio.sleep(poll_interval)
+    finally:
+        await tuning_task.cancel()
+
+    # Query the task once more to show what it reports after the cancel.
+    status = await tuning_task.get_status()
+    print(f'{status=} after cancel')
+
+    task_info = await tuning_task.get_task_info()
+    print(f'{task_info=} after cancel')
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/examples/async/tuning/list.py b/examples/async/tuning/list.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import asyncio
+import pathlib
+import uuid
+
+from yandex_cloud_ml_sdk import AsyncYCloudML
+
+
+def local_path(path: str) -> pathlib.Path:
+    """Return ``path`` resolved against the directory holding this script."""
+    script_dir = pathlib.Path(__file__).parent
+    return script_dir / path
+
+
+async def get_datasets(sdk):
+    """
+    Return a ``(train, validation)`` pair of datasets for the example.
+
+    The first dataset listed with status ``READY`` is reused when there is
+    one; otherwise the local ``example_dataset`` path is uploaded as a new
+    completions dataset. The same dataset object fills both slots of the
+    returned pair.
+
+    Real code usually already knows its dataset ids and can skip this
+    helper entirely, for example:
+
+    ```
+    dataset = await sdk.datasets.get("some_id")
+    tuning_task = await base_model.tune_deferred(
+        "dataset_id",
+        validation_datasets=dataset
+    )
+    ```
+    """
+
+    async for dataset in sdk.datasets.list(status="READY"):
+        # Any ready dataset will do: stop at the first one.
+        print(f'using old dataset {dataset=}')
+        return dataset, dataset
+
+    # The listing was exhausted without yielding anything.
+    print('no old datasets found, creating new one')
+    draft = sdk.datasets.completions.from_path_deferred(
+        path=local_path('example_dataset'),
+        upload_format='jsonlines',
+        name='foo',
+    )
+
+    # upload() gives back an awaitable operation that resolves to the dataset
+    upload_operation = await draft.upload()
+    dataset = await upload_operation
+    print(f'created new dataset {dataset=}')
+
+    return dataset, dataset
+
+
+async def main() -> None:
+    """
+    Start tuning task(s), find them again via ``sdk.tuning.list()`` and
+    cancel them.
+
+    Only tasks whose id was recorded when this script started them are
+    cancelled; any other task returned by the listing is left untouched.
+    """
+    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
+    train_dataset, validation_dataset = await get_datasets(sdk)
+    base_model = sdk.models.completions('yandexgpt-lite')
+
+    # Remember the ids of the tasks started here so that the cleanup below
+    # does not touch anything else.
+    task_ids = set()
+    for _ in range(1):
+        tuning_task = await base_model.tune_deferred(
+            train_dataset,
+            validation_datasets=validation_dataset,
+            name=str(uuid.uuid4())
+        )
+        task_ids.add(tuning_task.id)
+
+    # NB: there is a time gap before freshly started tuning tasks
+    # become available at the backend as `TuningTasks`
+    await asyncio.sleep(5)
+
+    async for tuning_task in sdk.tuning.list():
+        if tuning_task.id not in task_ids:
+            # Not started by this script: leave it alone.
+            continue
+
+        # or you could wait for tasks, instead of canceling
+        print(f'found task {tuning_task=}, canceling')
+        await tuning_task.cancel()
+
+
+if __name__ == '__main__':
+    asyncio.run(main())