Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add tracking to data file type column names 2. #553

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 57 additions & 55 deletions isatools/isatab/dump/write.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
)
from isatools.isatab.defaults import log
from isatools.isatab.graph import _all_end_to_end_paths, _longest_path_and_attrs
from isatools.model.utils import _build_paths_and_indexes
from isatools.isatab.utils import (
get_comment_column,
get_pv_columns,
Expand Down Expand Up @@ -260,24 +261,21 @@ def flatten(current_list):

columns = []

# start_nodes, end_nodes = _get_start_end_nodes(a_graph)
paths = _all_end_to_end_paths(
a_graph, [x for x in a_graph.nodes()
if isinstance(a_graph.indexes[x], Sample)])
paths, indexes = _build_paths_and_indexes(assay_obj.process_sequence)
if len(paths) == 0:
log.info("No paths found, skipping writing assay file")
continue
if _longest_path_and_attrs(paths, a_graph.indexes) is None:
if _longest_path_and_attrs(paths, indexes) is None:
raise IOError(
"Could not find any valid end-to-end paths in assay graph")

protocol_in_path_count = 0
for node_index in _longest_path_and_attrs(paths, a_graph.indexes):
node = a_graph.indexes[node_index]
output_label_in_path_counts = {}
name_label_in_path_counts = {}
for node_index in _longest_path_and_attrs(paths, indexes):
node = indexes[node_index]
if isinstance(node, Sample):
olabel = "Sample Name"
# olabel = "Sample Name.{}".format(sample_in_path_count)
# sample_in_path_count += 1
columns.append(olabel)
columns += flatten(
map(lambda x: get_comment_column(olabel, x),
Expand Down Expand Up @@ -307,28 +305,22 @@ def flatten(current_list):
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:
protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
columns.append(oname_label)

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0
new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
columns.append(new_oname_label)
name_label_in_path_counts[oname_label] += 1
if protocol_type in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
columns.append("Array Design REF")

columns += flatten(
map(lambda x: get_comment_column(olabel, x),
node.comments))

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
if output.label not in columns:
columns.append(output.label)
columns += flatten(
map(lambda x: get_comment_column(output.label, x),
output.comments))
elif isinstance(node, Material):
olabel = node.type
columns.append(olabel)
Expand All @@ -340,7 +332,16 @@ def flatten(current_list):
node.comments))

elif isinstance(node, DataFile):
pass # handled in process
output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])

columns.append(new_output_label)
output_label_in_path_counts[output_label] += 1
columns += flatten(
map(lambda x: get_comment_column(new_output_label, x),
node.comments))

omap = get_object_column_map(columns, columns)

Expand All @@ -355,8 +356,10 @@ def pbar(x):
df_dict[k].extend([""])

protocol_in_path_count = 0
output_label_in_path_counts = {}
name_label_in_path_counts = {}
for node_index in path_:
node = a_graph.indexes[node_index]
node = indexes[node_index]
if isinstance(node, Process):
olabel = "Protocol REF.{}".format(protocol_in_path_count)
protocol_in_path_count += 1
Expand All @@ -368,18 +371,19 @@ def pbar(x):
protocol_type = node.executes_protocol.protocol_type.lower()

if protocol_type in protocol_types_dict and\
protocol_types_dict[protocol_type][HEADER]:
protocol_types_dict[protocol_type][HEADER]:
oname_label = protocol_types_dict[protocol_type][HEADER]
else:
oname_label = None

if oname_label is not None:
df_dict[oname_label][-1] = node.name
if oname_label not in name_label_in_path_counts:
name_label_in_path_counts[oname_label] = 0

new_oname_label = oname_label + "." + str(name_label_in_path_counts[oname_label])
df_dict[new_oname_label][-1] = node.name
name_label_in_path_counts[oname_label] += 1

if protocol_type in protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
df_dict["Array Design REF"][-1] = \
node.array_design_ref

if node.executes_protocol.protocol_type.term.lower() in \
protocol_types_dict["nucleic acid hybridization"][SYNONYMS]:
df_dict["Array Design REF"][-1] = node.array_design_ref

if node.date is not None:
df_dict[olabel + ".Date"][-1] = node.date
if node.performer is not None:
Expand All @@ -391,23 +395,8 @@ def pbar(x):
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value

for output in [x for x in node.outputs if isinstance(x, DataFile)]:
output_by_type = []
delim = ";"
olabel = output.label
if output.label not in columns:
columns.append(output.label)
output_by_type.append(output.filename)
df_dict[olabel][-1] = delim.join(map(str, output_by_type))

for co in output.comments:
colabel = "{0}.Comment[{1}]".format(olabel, co.name)
df_dict[colabel][-1] = co.value

elif isinstance(node, Sample):
olabel = "Sample Name"
# olabel = "Sample Name.{}".format(sample_in_path_count)
# sample_in_path_count += 1
df_dict[olabel][-1] = node.name
for co in node.comments:
colabel = "{0}.Comment[{1}]".format(
Expand All @@ -434,7 +423,17 @@ def pbar(x):
df_dict[colabel][-1] = co.value

elif isinstance(node, DataFile):
pass # handled in process
output_label = node.label
if output_label not in output_label_in_path_counts:
output_label_in_path_counts[output_label] = 0
new_output_label = output_label + "." + str(output_label_in_path_counts[output_label])
df_dict[new_output_label][-1] = node.filename
output_label_in_path_counts[output_label] += 1

for co in node.comments:
colabel = "{0}.Comment[{1}]".format(
new_output_label, co.name)
df_dict[colabel][-1] = co.value

DF = DataFrame(columns=columns)
DF = DF.from_dict(data=df_dict)
Expand Down Expand Up @@ -482,6 +481,11 @@ def pbar(x):
columns[i] = "Protocol REF"
elif "." in col:
columns[i] = col[:col.rindex(".")]
else:
for output_label in output_label_in_path_counts:
if output_label in col:
columns[i] = output_label
break

log.debug("Rendered {} paths".format(len(DF.index)))
if len(DF.index) > 1:
Expand Down Expand Up @@ -521,8 +525,6 @@ def write_value_columns(df_dict, label, x):
elif x.unit.term_source.name:
df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name

# df_dict[label + ".Unit.Term Source REF"][-1] = \
# x.unit.term_source.name if x.unit.term_source else ""
df_dict[label + ".Unit.Term Accession Number"][-1] = \
x.unit.term_accession
else:
Expand Down
8 changes: 5 additions & 3 deletions isatools/isatab/load/ProcessSequenceFactory.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from isatools.isatab.utils import process_keygen, find_lt, find_gt, pairwise, get_object_column_map, get_value
from isatools.isatab.defaults import (
log,
Expand Down Expand Up @@ -146,7 +148,7 @@ def create_from_df(self, DF):
except KeyError:
pass

for data_col in [x for x in DF.columns if x.endswith(" File")]:
for data_col in [x for x in DF.columns if x in _LABELS_DATA_NODES]:
filenames = [x for x in DF[data_col].drop_duplicates() if x != '']
data.update(dict(map(lambda x: (':'.join([data_col, x]), DataFile(filename=x, label=data_col)), filenames)))

Expand All @@ -167,7 +169,7 @@ def get_node_by_label_and_key(labl, this_key):
n = samples[lk]
elif labl in ('Extract Name', 'Labeled Extract Name'):
n = other_material[lk]
elif labl.endswith(' File'):
elif labl in _LABELS_DATA_NODES:
n = data[lk]
return n

Expand Down Expand Up @@ -410,7 +412,7 @@ def get_node_by_label_and_key(labl, this_key):
process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF)
process_key_sequence.append(process_key)

if object_label.endswith(' File'):
if object_label in _LABELS_DATA_NODES:
data_node = None
try:
data_node = get_node_by_label_and_key(object_label, str(object_series[object_label]))
Expand Down
2 changes: 1 addition & 1 deletion isatools/isatab/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def get_object_column_map(isatab_header, df_columns):
"""
labels = _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES
if set(isatab_header) == set(df_columns):
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x]
object_index = [i for i, x in enumerate(df_columns) if x in labels or 'Protocol REF' in x or ' File' in x]
else:
object_index = [i for i, x in enumerate(isatab_header) if x in labels + ['Protocol REF']]

Expand Down
Loading