Skip to content

Commit

Permalink
Set notebook directory as current directory for pip installs from wor…
Browse files Browse the repository at this point in the history
…kspace fs (#961)

When users use a relative path to install a notebook-scoped library with
pip (e.g. `%pip install ../../foo`), it can fail since the cwd for shell
commands can be an ephemeral directory in DBRs < 14.0.
For other shell commands, our preamble handles the cwd, but pip commands
are moved to the beginning of the notebook, since they can lead to a
kernel restart and hence loss of the preamble. This means they can't access
the features of the full preamble and need an explicit `os.chdir` to set
the correct path.

## Changes
* Add an explicit `os.chdir(notebook_dir)` before each `%pip install`. 
* Also remove code to pass source_file and project_root to the wrappers
as notebook params (for notebook jobs) and CLI args (for Python file
jobs). These parameters are now hard-coded into the wrapper and stay
up to date because the wrapper is recreated on every run.

## Tests
* manual

Fixes #958
  • Loading branch information
kartikgupta-db authored Dec 5, 2023
1 parent c9864e1 commit eec87b8
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 116 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,8 @@
import sys
import os

databricks_arg_idx = []
for i, arg in enumerate(sys.argv):
if i == 0:
continue
if sys.argv[i-1] == "--databricks-source-file":
python_file = arg
elif sys.argv[i-1] == "--databricks-project-root":
project_root = arg
else:
continue
databricks_arg_idx.extend([i, i-1])

if python_file is None:
raise Exception("--databricks-source-file argument not specified")

if project_root is None:
raise Exception("--databricks-project-root argument not specified")

#remove databricks args from argv
sys.argv = [value for i,value in enumerate(sys.argv) if i not in databricks_arg_idx]
python_file = '{{DATABRICKS_SOURCE_FILE}}'
project_root = '{{DATABRICKS_PROJECT_ROOT}}'

# change working directory
os.chdir(os.path.dirname(python_file))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,17 @@ def databricks_preamble():
from IPython import get_ipython
from typing import List
from shlex import quote
import os
import sys

dbutils.widgets.text("DATABRICKS_SOURCE_FILE",
"{{DATABRICKS_SOURCE_FILE}}")
dbutils.widgets.text("DATABRICKS_PROJECT_ROOT",
"{{DATABRICKS_PROJECT_ROOT}}")
src_file_dir = None
project_root_dir = None
src_file_dir = os.path.dirname("{{DATABRICKS_SOURCE_FILE}}")
os.chdir(src_file_dir)

if dbutils.widgets.get("DATABRICKS_SOURCE_FILE") != "":
import os
src_file_dir = os.path.dirname(
dbutils.widgets.get("DATABRICKS_SOURCE_FILE"))
os.chdir(src_file_dir)

if dbutils.widgets.get("DATABRICKS_PROJECT_ROOT") != "":
import sys
project_root_dir = dbutils.widgets.get("DATABRICKS_PROJECT_ROOT")
sys.path.insert(0, project_root_dir)
project_root_dir = "{{DATABRICKS_PROJECT_ROOT}}"
sys.path.insert(0, project_root_dir)

def parse_databricks_magic_lines(lines: List[str]):
if len(lines) == 0 or src_file_dir is None:
if len(lines) == 0:
return lines

first = ""
Expand Down
23 changes: 6 additions & 17 deletions packages/databricks-vscode/src/run/WorkflowRunner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,16 +173,7 @@ export class WorkflowRunner implements Disposable {
panel.showExportedRun(
await cluster.runNotebookAndWait({
path: remoteFilePath,
parameters: {
// eslint-disable-next-line @typescript-eslint/naming-convention
DATABRICKS_SOURCE_FILE:
syncDestination.localToRemote(program)
.workspacePrefixPath,
// eslint-disable-next-line @typescript-eslint/naming-convention
DATABRICKS_PROJECT_ROOT:
syncDestination.remoteUri.workspacePrefixPath,
...parameters,
},
parameters,
onProgress: (
state: jobs.RunLifeCycleState,
run: WorkflowRun
Expand All @@ -200,16 +191,14 @@ export class WorkflowRunner implements Disposable {
? await new WorkspaceFsWorkflowWrapper(
this.connectionManager,
this.context
).createPythonFileWrapper(originalFileUri)
).createPythonFileWrapper(
originalFileUri,
syncDestination.remoteUri
)
: undefined;
const response = await cluster.runPythonAndWait({
path: wrappedFile ? wrappedFile.path : originalFileUri.path,
args: (args ?? []).concat([
"--databricks-source-file",
originalFileUri.workspacePrefixPath,
"--databricks-project-root",
syncDestination.remoteUri.workspacePrefixPath,
]),
args: args ?? [],
onProgress: (
state: jobs.RunLifeCycleState,
run: WorkflowRun
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,25 @@ describe(__filename, async () => {
);
}

async function getWrapperData(
wrapperName:
| "notebook.workflow-wrapper.py"
| "file.workflow-wrapper.py",
originalFilePath: string
) {
return (
await readFile(path.join(resourceDirPath, wrapperName), "utf-8")
)
.replace(
"{{DATABRICKS_SOURCE_FILE}}",
new RemoteUri(originalFilePath).workspacePrefixPath
)
.replace(
"{{DATABRICKS_PROJECT_ROOT}}",
new RemoteUri(testDirPath).workspacePrefixPath
);
}

describe("python files", () => {
beforeEach(createMocks);
it("should create wrapper for files", async () => {
Expand All @@ -100,11 +119,14 @@ describe(__filename, async () => {
await new WorkspaceFsWorkflowWrapper(
instance(mockConnectionManager),
instance(mockExtensionContext)
).createPythonFileWrapper(new RemoteUri(originalFilePath));
).createPythonFileWrapper(
new RemoteUri(originalFilePath),
new RemoteUri(testDirPath)
);

const wrapperData = await readFile(
path.join(resourceDirPath, "file.workflow-wrapper.py"),
"utf-8"
const wrapperData = await getWrapperData(
"file.workflow-wrapper.py",
originalFilePath
);
verify(
mockWorkspaceService.import(
Expand Down Expand Up @@ -165,23 +187,6 @@ describe(__filename, async () => {
});
});

async function getWrapperData() {
return (
await readFile(
path.join(resourceDirPath, "notebook.workflow-wrapper.py"),
"utf-8"
)
)
.replace(
"{{DATABRICKS_SOURCE_FILE}}",
new RemoteUri(originalFilePath).workspacePrefixPath
)
.replace(
"{{DATABRICKS_PROJECT_ROOT}}",
new RemoteUri(testDirPath).workspacePrefixPath
);
}

it("should create wrapper for databricks python notebook", async () => {
await withFile(async (localFilePath) => {
const comment = ["# Databricks notebook source"];
Expand All @@ -202,7 +207,10 @@ describe(__filename, async () => {
"PY_DBNB"
);

const wrapperData = await getWrapperData();
const wrapperData = await getWrapperData(
"notebook.workflow-wrapper.py",
originalFilePath
);
const expected = comment
.concat(wrapperData.split(/\r?\n/))
.concat(["# COMMAND ----------"])
Expand Down Expand Up @@ -244,9 +252,16 @@ describe(__filename, async () => {
"PY_DBNB"
);

const wrapperData = await getWrapperData();
const wrapperData = await getWrapperData(
"notebook.workflow-wrapper.py",
originalFilePath
);
const expected = comment
.concat([
"import os",
`os.chdir(os.path.dirname('${
new RemoteUri(originalFilePath).workspacePrefixPath
}'))`,
"# MAGIC %pip install pandas",
"# COMMAND ----------",
"dbutils.library.restartPython()",
Expand Down Expand Up @@ -377,7 +392,7 @@ describe(__filename, async () => {
},
},
outputs: [],
execution_count: 0,
execution_count: null,
};

it("should create wrapper for databricks jupyter notebook", async () => {
Expand Down Expand Up @@ -448,7 +463,19 @@ describe(__filename, async () => {
const expected = {
...notebookMetadata,
cells: [
{...wrapperData, source: ["%pip install x"]},
{
...wrapperData,
source: [
[
"import os",
`os.chdir(os.path.dirname('${
new RemoteUri(originalFilePath)
.workspacePrefixPath
}'))`,
"%pip install x",
].join("\n"),
],
},
{
...wrapperData,
source: ["dbutils.library.restartPython()"],
Expand Down
Loading

0 comments on commit eec87b8

Please sign in to comment.