Skip to content

Commit

Permalink
🎨 format all code using autopep8
Browse files Browse the repository at this point in the history
- both scripts (notebooks)
- and library code
  • Loading branch information
Henry committed Sep 18, 2023
1 parent a2b97b8 commit 13dba85
Show file tree
Hide file tree
Showing 150 changed files with 3,363 additions and 2,479 deletions.
65 changes: 31 additions & 34 deletions project/00_0_0_lftp_upload_commands.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@
},
"outputs": [],
"source": [
"fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n",
"fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files\n",
"fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n",
"out_folder: str = 'data/rename' # output folder\n",
"fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files"
"fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow\n",
"fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files\n",
"fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides\n",
"out_folder: str = 'data/rename' # output folder\n",
"fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files"
]
},
{
Expand Down Expand Up @@ -133,14 +133,14 @@
"outputs": [],
"source": [
"cols_identifies = [('FileProperties', 'Pathname'),\n",
" ('FileProperties', 'Version'),\n",
" ('FileProperties', 'Content Creation Date'),\n",
" ('InstrumentProperties', 'Thermo Scientific instrument model'),\n",
" ('InstrumentProperties', 'instrument attribute'),\n",
" ('InstrumentProperties', 'instrument serial number'),\n",
" ('InstrumentProperties', 'Software Version'),\n",
" ('InstrumentProperties', 'firmware version'),\n",
"]\n",
" ('FileProperties', 'Version'),\n",
" ('FileProperties', 'Content Creation Date'),\n",
" ('InstrumentProperties', 'Thermo Scientific instrument model'),\n",
" ('InstrumentProperties', 'instrument attribute'),\n",
" ('InstrumentProperties', 'instrument serial number'),\n",
" ('InstrumentProperties', 'Software Version'),\n",
" ('InstrumentProperties', 'firmware version'),\n",
" ]\n",
"\n",
"df_meta = df_meta[cols_identifies]\n",
"df_meta.columns = [t[-1] for t in cols_identifies]\n",
Expand Down Expand Up @@ -198,9 +198,9 @@
"source": [
"date_col = \"Content Creation Date\"\n",
"idx_all = (pd.to_datetime(df_meta[date_col]).dt.strftime(\"%Y_%m_%d_%H_%M\")\n",
" + '_'\n",
" + df_meta[\"Instrument_name\"]\n",
").str.replace(' ', '-')\n",
" + '_'\n",
" + df_meta[\"Instrument_name\"]\n",
" ).str.replace(' ', '-')\n",
"\n",
"mask = idx_all.duplicated(keep=False)\n",
"duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps\n",
Expand All @@ -214,8 +214,7 @@
"metadata": {},
"outputs": [],
"source": [
"df_meta['new_sample_id'] = idx_all\n",
"\n",
"df_meta['new_sample_id'] = idx_all\n",
"\n",
"\n",
"_n = df_meta.groupby(\"new_sample_id\").cumcount().astype('string').str.replace('0', '')\n",
Expand Down Expand Up @@ -340,10 +339,10 @@
" if string_ not in used_before:\n",
" ret += f'_{string_}'\n",
" used_before |= set(strings_)\n",
" ret = (ret[1:] # remove _ from start\n",
" ret = (ret[1:] # remove _ from start\n",
" .replace('Slot_#', '')\n",
" .replace('slot_#', '')\n",
" )\n",
" )\n",
" return ret\n",
"\n",
"\n",
Expand All @@ -353,7 +352,7 @@
" \"instrument attribute\",\n",
" \"instrument serial number\",\n",
" ]\n",
" ]\n",
"]\n",
" .sample(20)\n",
" .apply(build_instrument_name, axis=1)\n",
")"
Expand Down Expand Up @@ -401,8 +400,8 @@
" .loc[selected, \"Path_old\"]\n",
" .iloc[:3]\n",
" .to_csv(out_folder / 'rawfiles_to_checksum.txt',\n",
" index=False,\n",
" header=False)\n",
" index=False,\n",
" header=False)\n",
" )"
]
},
Expand Down Expand Up @@ -453,7 +452,7 @@
"```\n",
"to allow parallel commands, use the runtime setting\n",
"```bash\n",
">>> cat ~/.lftprc \n",
">>> cat ~/.lftprc\n",
"set cmd:parallel 2\n",
"```"
]
Expand Down Expand Up @@ -501,11 +500,11 @@
"source": [
"commands = df_meta.loc[selected]\n",
"commands = (\n",
" 'put ' \n",
" 'put '\n",
" + commands['Path_old'].astype('string')\n",
" + ' -o ' \n",
" + \"./raw_files/\" \n",
" + commands[\"Instrument_name\"] \n",
" + ' -o '\n",
" + \"./raw_files/\"\n",
" + commands[\"Instrument_name\"]\n",
" + '/'\n",
" + commands['new_sample_id'] + '.raw'\n",
")\n",
Expand Down Expand Up @@ -559,9 +558,9 @@
"source": [
"commands = df_meta.loc[selected]\n",
"commands = (\n",
" \"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf \" # command\n",
" + \"mq_out/\" + commands.index # source\n",
" + \" ./MQ_tables/\" + commands[\"Instrument_name\"]+ \"/\" + commands[\"new_sample_id\"] # dest\n",
" \"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf \" # command\n",
" + \"mq_out/\" + commands.index # source\n",
" + \" ./MQ_tables/\" + commands[\"Instrument_name\"] + \"/\" + commands[\"new_sample_id\"] # dest\n",
")\n",
"\n",
"print(commands.sample(10).to_csv(header=False, index=False))"
Expand All @@ -579,9 +578,7 @@
"cell_type": "code",
"execution_count": null,
"id": "83c04b90-0c4e-4fe7-88f6-ed02cef93a23",
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"fname = out_folder / 'lftp_commands_mq_output.txt'\n",
Expand Down
62 changes: 30 additions & 32 deletions project/00_0_0_lftp_upload_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ def rename(fname, new_sample_id, new_folder=None, ext=None):
# ## Arguments

# %% tags=["parameters"]
fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow
fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files
fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides
out_folder: str = 'data/rename' # output folder
fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files
fn_rawfile_metadata: str = 'data/rawfile_metadata.csv' # Machine parsed metadata from rawfile workflow
fn_mq_summaries: str = 'data/samples_selected_summaries.csv' # MaxQuant summary files
fn_files_selected: str = 'data/samples_selected.yaml' # selected files based on threshold of identified peptides
out_folder: str = 'data/rename' # output folder
fn_server_log: str = 'data/rename/mq_out_server.log' # server log of all uploaded files

# %%
out_folder = Path(out_folder)
Expand Down Expand Up @@ -79,14 +79,14 @@ def rename(fname, new_sample_id, new_folder=None, ext=None):

# %%
cols_identifies = [('FileProperties', 'Pathname'),
('FileProperties', 'Version'),
('FileProperties', 'Content Creation Date'),
('InstrumentProperties', 'Thermo Scientific instrument model'),
('InstrumentProperties', 'instrument attribute'),
('InstrumentProperties', 'instrument serial number'),
('InstrumentProperties', 'Software Version'),
('InstrumentProperties', 'firmware version'),
]
('FileProperties', 'Version'),
('FileProperties', 'Content Creation Date'),
('InstrumentProperties', 'Thermo Scientific instrument model'),
('InstrumentProperties', 'instrument attribute'),
('InstrumentProperties', 'instrument serial number'),
('InstrumentProperties', 'Software Version'),
('InstrumentProperties', 'firmware version'),
]

df_meta = df_meta[cols_identifies]
df_meta.columns = [t[-1] for t in cols_identifies]
Expand All @@ -113,17 +113,16 @@ def rename(fname, new_sample_id, new_folder=None, ext=None):
# %%
date_col = "Content Creation Date"
idx_all = (pd.to_datetime(df_meta[date_col]).dt.strftime("%Y_%m_%d_%H_%M")
+ '_'
+ df_meta["Instrument_name"]
).str.replace(' ', '-')
+ '_'
+ df_meta["Instrument_name"]
).str.replace(' ', '-')

mask = idx_all.duplicated(keep=False)
duplicated_sample_idx = idx_all.loc[mask].sort_values() # duplicated dumps
duplicated_sample_idx

# %%
df_meta['new_sample_id'] = idx_all

df_meta['new_sample_id'] = idx_all


_n = df_meta.groupby("new_sample_id").cumcount().astype('string').str.replace('0', '')
Expand Down Expand Up @@ -182,10 +181,10 @@ def build_instrument_name(s):
if string_ not in used_before:
ret += f'_{string_}'
used_before |= set(strings_)
ret = (ret[1:] # remove _ from start
ret = (ret[1:] # remove _ from start
.replace('Slot_#', '')
.replace('slot_#', '')
)
)
return ret


Expand All @@ -195,7 +194,7 @@ def build_instrument_name(s):
"instrument attribute",
"instrument serial number",
]
]
]
.sample(20)
.apply(build_instrument_name, axis=1)
)
Expand All @@ -217,8 +216,8 @@ def build_instrument_name(s):
.loc[selected, "Path_old"]
.iloc[:3]
.to_csv(out_folder / 'rawfiles_to_checksum.txt',
index=False,
header=False)
index=False,
header=False)
)

# %% [markdown]
Expand Down Expand Up @@ -247,7 +246,7 @@ def build_instrument_name(s):
# ```
# to allow parallel commands, use the runtime setting
# ```bash
# >>> cat ~/.lftprc
# >>> cat ~/.lftprc
# set cmd:parallel 2
# ```

Expand All @@ -269,11 +268,11 @@ def build_instrument_name(s):
# %%
commands = df_meta.loc[selected]
commands = (
'put '
'put '
+ commands['Path_old'].astype('string')
+ ' -o '
+ "./raw_files/"
+ commands["Instrument_name"]
+ ' -o '
+ "./raw_files/"
+ commands["Instrument_name"]
+ '/'
+ commands['new_sample_id'] + '.raw'
)
Expand All @@ -299,9 +298,9 @@ def build_instrument_name(s):
# %%
commands = df_meta.loc[selected]
commands = (
"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf " # command
+ "mq_out/" + commands.index # source
+ " ./MQ_tables/" + commands["Instrument_name"]+ "/" + commands["new_sample_id"] # dest
"mirror -R --only-missing --log log_lftp_mirror.log --exclude-glob *.pdf " # command
+ "mq_out/" + commands.index # source
+ " ./MQ_tables/" + commands["Instrument_name"] + "/" + commands["new_sample_id"] # dest
)

print(commands.sample(10).to_csv(header=False, index=False))
Expand All @@ -312,4 +311,3 @@ def build_instrument_name(s):
# %%
fname = out_folder / 'lftp_commands_mq_output.txt'
commands.to_csv(fname, header=False, index=False)

22 changes: 10 additions & 12 deletions project/00_0_1_check_filesizes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@
"source": [
"mask = (entries['size_pride'] - entries['size_erda']).abs() > 5\n",
"to_redo = entries.loc[mask].reset_index()\n",
"to_redo "
"to_redo"
]
},
{
Expand All @@ -172,7 +172,7 @@
"id": "b6087751",
"metadata": {},
"source": [
"## Check MaxQuant output filesizes "
"## Check MaxQuant output filesizes"
]
},
{
Expand Down Expand Up @@ -207,7 +207,7 @@
" files.append(entry)\n",
" if entry.id_old not in folder:\n",
" folder.add(entry.id_old)\n",
" \n",
"\n",
"print(f\"{len(folder) =: }\")\n",
"print(f\"{len(files) =: }\")\n",
"files[:3]"
Expand Down Expand Up @@ -235,11 +235,11 @@
"outputs": [],
"source": [
"files['path_pride'] = ('MQ_tables/'\n",
" + files['Instrument_name']\n",
" + '/' \n",
" + files[\"new_sample_id\"]\n",
" + '/'\n",
" + files[\"filename\"])\n",
" + files['Instrument_name']\n",
" + '/'\n",
" + files[\"new_sample_id\"]\n",
" + '/'\n",
" + files[\"filename\"])\n",
"files['path_pride'].iloc[:4].to_list()"
]
},
Expand All @@ -250,7 +250,7 @@
"metadata": {},
"outputs": [],
"source": [
"files['filename'].value_counts() # except mqpar.xml all present on erda"
"files['filename'].value_counts() # except mqpar.xml all present on erda"
]
},
{
Expand Down Expand Up @@ -359,9 +359,7 @@
"cell_type": "code",
"execution_count": null,
"id": "3fc22aef",
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"to_do = pd.concat([missing_on_pride, files_redo])\n",
Expand Down
Loading

0 comments on commit 13dba85

Please sign in to comment.