diff --git a/tools/add_input_name_as_column/add_input_name_as_column.py b/tools/add_input_name_as_column/add_input_name_as_column.py index 3ab71509d06..b8b8b71da3b 100644 --- a/tools/add_input_name_as_column/add_input_name_as_column.py +++ b/tools/add_input_name_as_column/add_input_name_as_column.py @@ -4,11 +4,28 @@ def Parser(): the_parser = argparse.ArgumentParser(description="add label to last column of file") - the_parser.add_argument('--input', required=True, action="store", type=str, help="input tabular file") - the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path") - the_parser.add_argument('--label', required=True, action="store", type=str, help="label to add in last column") - the_parser.add_argument('--header', action="store", type=str, help="column label for last column") - the_parser.add_argument('--prepend', action='store_true', default=False, help='Prepend column instead of appending' ) + the_parser.add_argument( + "--input", required=True, action="store", type=str, help="input tabular file" + ) + the_parser.add_argument( + "--output", required=True, action="store", type=str, help="output file path" + ) + the_parser.add_argument( + "--label", + required=True, + action="store", + type=str, + help="label to add in last column", + ) + the_parser.add_argument( + "--header", action="store", type=str, help="column label for last column" + ) + the_parser.add_argument( + "--prepend", + action="store_true", + default=False, + help="Prepend column instead of appending", + ) args = the_parser.parse_args() return args @@ -17,9 +34,11 @@ def Parser(): args = Parser() -with io.open(args.input, encoding="utf-8") as input, io.open(args.output, 'w', encoding="utf-8") as output: +with io.open(args.input, encoding="utf-8") as input, io.open( + args.output, "w", encoding="utf-8" +) as output: for i, line in enumerate(input): - line = line.strip('\n') + line = line.strip("\n") if (i == 0) and args.header: new_entry = args.header else: diff --git a/tools/column_order_header_sort/column_order_header_sort.py b/tools/column_order_header_sort/column_order_header_sort.py index 43254aad895..0fcdcc0b2a8 100644 --- a/tools/column_order_header_sort/column_order_header_sort.py +++ b/tools/column_order_header_sort/column_order_header_sort.py @@ -11,25 +11,35 @@ key_column = sys.argv[4] try: - key_column = int( key_column ) - 1 + key_column = int(key_column) - 1 except Exception: key_column = None header = None -with open( input_filename, 'r' ) as fh: - header = fh.readline().strip( '\r\n' ) -header = header.split( delimiter ) -assert len( header ) == len( set( header ) ), "Header values must be unique" -sorted_header = list( header ) +with open(input_filename, "r") as fh: + header = fh.readline().strip("\r\n") +header = header.split(delimiter) +assert len(header) == len(set(header)), "Header values must be unique" +sorted_header = list(header) if key_column is None: columns = [] else: - columns = [ key_column ] - sorted_header.pop( key_column ) + columns = [key_column] + sorted_header.pop(key_column) sorted_header.sort() for key in sorted_header: - columns.append( header.index( key ) ) + columns.append(header.index(key)) -awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%i" % ( x + 1 ), columns ) ) ) -sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) ) +awk_cmd = AWK_CMD % ( + delimiter, + delimiter, + ",".join(map(lambda x: "$%i" % (x + 1), columns)), +) +sys.exit( + subprocess.call( + ["gawk", awk_cmd, input_filename], + stdout=open(output_filename, "wb+"), + shell=False, + ) +) diff --git a/tools/cwpair2/cwpair2.py b/tools/cwpair2/cwpair2.py index 1a50d75850d..0ea75ea084a 100644 --- a/tools/cwpair2/cwpair2.py +++ b/tools/cwpair2/cwpair2.py @@ -17,18 +17,63 @@ import cwpair2_util -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--input', dest='inputs', action='append', nargs=2, help="Input datasets") - parser.add_argument('--method', dest='method', default='mode', help='Method of finding match.') - parser.add_argument('--up_distance', dest='up_distance', type=int, default=50, help='Distance upstream from a pair.') - parser.add_argument('--down_distance', dest='down_distance', type=int, default=100, help='Distance downstream of a pair.') - parser.add_argument('--binsize', dest='binsize', type=int, default=1, help='Width of bins for plots and mode.') - parser.add_argument('--threshold_format', dest='threshold_format', help='Percentage to filter the 95th percentile.') - parser.add_argument('--relative_threshold', dest='relative_threshold', type=float, default=0.0, help='Percentage to filter the 95th percentile.') - parser.add_argument('--absolute_threshold', dest='absolute_threshold', type=float, default=0.0, help='Absolute value to filter.') - parser.add_argument('--output_files', dest='output_files', default='matched_pair', help='Restrict output dataset collections.') - parser.add_argument('--statistics_output', dest='statistics_output', help='Statistics output file.') + parser.add_argument( + "--input", dest="inputs", action="append", nargs=2, help="Input datasets" + ) + parser.add_argument( + "--method", dest="method", default="mode", help="Method of finding match." + ) + parser.add_argument( + "--up_distance", + dest="up_distance", + type=int, + default=50, + help="Distance upstream from a pair.", + ) + parser.add_argument( + "--down_distance", + dest="down_distance", + type=int, + default=100, + help="Distance downstream of a pair.", + ) + parser.add_argument( + "--binsize", + dest="binsize", + type=int, + default=1, + help="Width of bins for plots and mode.", + ) + parser.add_argument( + "--threshold_format", + dest="threshold_format", + help="Percentage to filter the 95th percentile.", + ) + parser.add_argument( + "--relative_threshold", + dest="relative_threshold", + type=float, + default=0.0, + help="Percentage to filter the 95th percentile.", + ) + parser.add_argument( + "--absolute_threshold", + dest="absolute_threshold", + type=float, + default=0.0, + help="Absolute value to filter.", + ) + parser.add_argument( + "--output_files", + dest="output_files", + default="matched_pair", + help="Restrict output dataset collections.", + ) + parser.add_argument( + "--statistics_output", dest="statistics_output", help="Statistics output file." + ) args = parser.parse_args() cwpair2_util.create_directories() @@ -41,14 +86,16 @@ else: threshold = 0 for (dataset_path, hid) in args.inputs: - stats = cwpair2_util.process_file(dataset_path, - hid, - args.method, - threshold, - args.up_distance, - args.down_distance, - args.binsize, - args.output_files) + stats = cwpair2_util.process_file( + dataset_path, + hid, + args.method, + threshold, + args.up_distance, + args.down_distance, + args.binsize, + args.output_files, + ) statistics.extend(stats) # Accumulate statistics. by_file = {} @@ -56,13 +103,15 @@ # Skip "None" statistics from failed files if not stats: continue - path = stats['stats_path'] + path = stats["stats_path"] if path not in by_file: by_file[path] = [] by_file[path].append(stats) # Write tabular statistics file. - keys = ['fname', 'final_mode', 'preview_mode', 'perc95', 'paired', 'orphans'] - statistics_out = csv.writer(open(args.statistics_output, 'wt'), delimiter='\t', lineterminator="\n") + keys = ["fname", "final_mode", "preview_mode", "perc95", "paired", "orphans"] + statistics_out = csv.writer( + open(args.statistics_output, "wt"), delimiter="\t", lineterminator="\n" + ) statistics_out.writerow(keys) for file_path, statistics in by_file.items(): for stats in statistics: diff --git a/tools/cwpair2/cwpair2_util.py b/tools/cwpair2/cwpair2_util.py index 0cd0ffb6a4e..c9fb2f17818 100644 --- a/tools/cwpair2/cwpair2_util.py +++ b/tools/cwpair2/cwpair2_util.py @@ -5,39 +5,39 @@ import traceback import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") from matplotlib import pyplot # noqa: I202,E402 # Data outputs -DETAILS = 'D' -MATCHED_PAIRS = 'MP' -ORPHANS = 'O' +DETAILS = "D" +MATCHED_PAIRS = "MP" +ORPHANS = "O" # Data output formats -GFF_EXT = 'gff' -TABULAR_EXT = 'tabular' +GFF_EXT = "gff" +TABULAR_EXT = "tabular" # Statistics histograms output directory. -HISTOGRAM = 'H' +HISTOGRAM = "H" # Statistics outputs -FINAL_PLOTS = 'F' -PREVIEW_PLOTS = 'P' -STATS_GRAPH = 'C' +FINAL_PLOTS = "F" +PREVIEW_PLOTS = "P" +STATS_GRAPH = "C" # Graph settings. -COLORS = 'krg' -Y_LABEL = 'Peak-pair counts' -X_LABEL = 'Peak-pair distance (bp)' +COLORS = "krg" +Y_LABEL = "Peak-pair counts" +X_LABEL = "Peak-pair distance (bp)" TICK_WIDTH = 3 ADJUST = [0.140, 0.9, 0.9, 0.1] -PLOT_FORMAT = 'pdf' -pyplot.rc('xtick.major', size=10.00) -pyplot.rc('ytick.major', size=10.00) -pyplot.rc('lines', linewidth=4.00) -pyplot.rc('axes', linewidth=3.00) -pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) +PLOT_FORMAT = "pdf" +pyplot.rc("xtick.major", size=10.00) +pyplot.rc("ytick.major", size=10.00) +pyplot.rc("lines", linewidth=4.00) +pyplot.rc("axes", linewidth=3.00) +pyplot.rc("font", family="Bitstream Vera Sans", size=32.0) class FrequencyDistribution(object): - def __init__(self, start, end, binsize=10, d=None): self.start = start self.end = end @@ -48,7 +48,11 @@ def get_bin(self, x): """ Returns the centre of the bin in which a data point falls """ - return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0 + return ( + self.start + + (x - self.start) // self.binsize * self.binsize + + self.binsize / 2.0 + ) def add(self, x): x = self.get_bin(x) @@ -84,14 +88,16 @@ def distance(peak1, peak2): return (peak2[1] + peak2[2]) / 2.0 - (peak1[1] + peak1[2]) / 2.0 -def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): +def gff_row( + cname, start, end, score, source, type=".", strand=".", phase=".", attrs={} +): return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) def gff_attrs(d): if not d: - return '.' - return ';'.join('%s=%s' % item for item in d.items()) + return "." + return ";".join("%s=%s" % item for item in d.items()) def parse_chromosomes(reader): @@ -99,7 +105,7 @@ def parse_chromosomes(reader): chromosomes = {} for line in reader: line = line.rstrip("\r\n") - if not line or line.startswith('#'): + if not line or line.startswith("#"): continue cname, _, _, start, end, value, strand, _, _ = line.split("\t") start = int(start) @@ -139,8 +145,8 @@ def peak_filter(chromosomes, threshold): def split_strands(chromosome): - watson = [peak for peak in chromosome if peak[0] == '+'] - crick = [peak for peak in chromosome if peak[0] == '-'] + watson = [peak for peak in chromosome if peak[0] == "+"] + crick = [peak for peak in chromosome if peak[0] == "-"] return watson, crick @@ -192,6 +198,7 @@ def key(cpeak): # And then prefer less negative distances d = 10000 - d return d + return min(window, key=key) @@ -201,125 +208,167 @@ def match_mode(window, peak, mode): return min(window, key=lambda cpeak: abs(distance(peak, cpeak) - mode)) -METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest} +METHODS = {"mode": match_mode, "closest": match_closest, "largest": match_largest} -def frequency_plot(freqs, fname, labels=[], title=''): +def frequency_plot(freqs, fname, labels=[], title=""): pyplot.clf() pyplot.figure(figsize=(10, 10)) for i, freq in enumerate(freqs): x, y = freq.graph_series() - pyplot.plot(x, y, '%s-' % COLORS[i]) + pyplot.plot(x, y, "%s-" % COLORS[i]) if len(freqs) > 1: pyplot.legend(labels) pyplot.xlim(freq.start, freq.end) pyplot.ylim(ymin=0) pyplot.ylabel(Y_LABEL) pyplot.xlabel(X_LABEL) - pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) + pyplot.subplots_adjust( + left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3] + ) # Get the current axes ax = pyplot.gca() - for l in ax.get_xticklines() + ax.get_yticklines(): - l.set_markeredgewidth(TICK_WIDTH) + for line in ax.get_xticklines() + ax.get_yticklines(): + line.set_markeredgewidth(TICK_WIDTH) pyplot.savefig(fname) def create_directories(): # Output histograms in pdf. os.mkdir(HISTOGRAM) - os.mkdir('data_%s' % DETAILS) - os.mkdir('data_%s' % ORPHANS) - os.mkdir('data_%s' % MATCHED_PAIRS) - - -def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, - down_distance, binsize, output_files): - if method == 'all': + os.mkdir("data_%s" % DETAILS) + os.mkdir("data_%s" % ORPHANS) + os.mkdir("data_%s" % MATCHED_PAIRS) + + +def process_file( + dataset_path, + galaxy_hid, + method, + threshold, + up_distance, + down_distance, + binsize, + output_files, +): + if method == "all": match_methods = METHODS.keys() else: match_methods = [method] statistics = [] for match_method in match_methods: - stats = perform_process(dataset_path, - galaxy_hid, - match_method, - threshold, - up_distance, - down_distance, - binsize, - output_files) + stats = perform_process( + dataset_path, + galaxy_hid, + match_method, + threshold, + up_distance, + down_distance, + binsize, + output_files, + ) statistics.append(stats) - if output_files == 'all' and method == 'all': - frequency_plot([s['dist'] for s in statistics], - statistics[0]['graph_path'], - labels=list(METHODS.keys())) + if output_files == "all" and method == "all": + frequency_plot( + [s["dist"] for s in statistics], + statistics[0]["graph_path"], + labels=list(METHODS.keys()), + ) return statistics -def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, - down_distance, binsize, output_files): +def perform_process( + dataset_path, + galaxy_hid, + method, + threshold, + up_distance, + down_distance, + binsize, + output_files, +): output_details = output_files in ["all", "matched_pair_orphan_detail"] output_plots = output_files in ["all"] - output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] + output_orphans = output_files in [ + "all", + "matched_pair_orphan", + "matched_pair_orphan_detail", + ] # Keep track of statistics for the output file statistics = {} fpath, fname = os.path.split(dataset_path) - statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) - statistics['dir'] = fpath + statistics["fname"] = "%s: data %s" % (method, str(galaxy_hid)) + statistics["dir"] = fpath if threshold >= 1: - filter_string = 'fa%d' % threshold + filter_string = "fa%d" % threshold else: - filter_string = 'f%d' % (threshold * 100) - fname = '%s_%su%dd%d_on_data_%s' % (method, filter_string, up_distance, down_distance, galaxy_hid) + filter_string = "f%d" % (threshold * 100) + fname = "%s_%su%dd%d_on_data_%s" % ( + method, + filter_string, + up_distance, + down_distance, + galaxy_hid, + ) def make_histogram_path(output_type, fname): - return os.path.join(HISTOGRAM, 'histogram_%s_%s.%s' % (output_type, fname, PLOT_FORMAT)) + return os.path.join( + HISTOGRAM, "histogram_%s_%s.%s" % (output_type, fname, PLOT_FORMAT) + ) def make_path(output_type, extension, fname): # Returns the full path for an output. - return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) + return os.path.join(output_type, "%s_%s.%s" % (output_type, fname, extension)) def td_writer(output_type, extension, fname): # Returns a tab-delimited writer for a specified output. output_file_path = make_path(output_type, extension, fname) - return csv.writer(open(output_file_path, 'wt'), delimiter='\t', lineterminator="\n") + return csv.writer( + open(output_file_path, "wt"), delimiter="\t", lineterminator="\n" + ) - with open(dataset_path, 'rt') as input: + with open(dataset_path, "rt") as input: try: chromosomes = parse_chromosomes(input) except Exception: - stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) + stop_err( + 'Unable to parse file "%s".\n%s' + % (dataset_path, traceback.format_exc()) + ) if output_details: # Details - detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) - detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) + detailed_output = td_writer("data_%s" % DETAILS, TABULAR_EXT, fname) + detailed_output.writerow( + ("chrom", "start", "end", "value", "strand") * 2 + + ("midpoint", "c-w reads sum", "c-w distance (bp)") + ) if output_plots: # Final Plot final_plot_path = make_histogram_path(FINAL_PLOTS, fname) if output_orphans: # Orphans - orphan_output = td_writer('data_%s' % ORPHANS, TABULAR_EXT, fname) - orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) + orphan_output = td_writer("data_%s" % ORPHANS, TABULAR_EXT, fname) + orphan_output.writerow(("chrom", "strand", "start", "end", "value")) if output_plots: # Preview Plot preview_plot_path = make_histogram_path(PREVIEW_PLOTS, fname) # Matched Pairs. - matched_pairs_output = td_writer('data_%s' % MATCHED_PAIRS, GFF_EXT, fname) - statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT + matched_pairs_output = td_writer("data_%s" % MATCHED_PAIRS, GFF_EXT, fname) + statistics["stats_path"] = "statistics.%s" % TABULAR_EXT if output_plots: - statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) - statistics['perc95'] = perc95(chromosomes) + statistics["graph_path"] = make_histogram_path(STATS_GRAPH, fname) + statistics["perc95"] = perc95(chromosomes) if threshold > 0: # Apply peak_filter peak_filter(chromosomes, threshold) - if method == 'mode': + if method == "mode": freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) mode = freq.mode() - statistics['preview_mode'] = mode + statistics["preview_mode"] = mode if output_plots: - frequency_plot([freq], preview_plot_path, title='Preview frequency plot') + frequency_plot([freq], preview_plot_path, title="Preview frequency plot") else: - statistics['preview_mode'] = 'NA' + statistics["preview_mode"] = "NA" dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) orphans = 0 # x will be used to archive the summary dataset @@ -334,7 +383,7 @@ def td_writer(output_type, extension, fname): keys = make_keys(crick) for peak in watson: window = get_window(crick, peak, up_distance, down_distance, keys) - if method == 'mode': + if method == "mode": match = match_mode(window, peak, mode) else: match = METHODS[method](window, peak) @@ -343,25 +392,34 @@ def td_writer(output_type, extension, fname): d = distance(peak, match) dist.add(d) # Simple output in gff format. - x.append(gff_row(cname, - source='cwpair', - start=midpoint, - end=midpoint + 1, - score=peak[3] + match[3], - attrs={'cw_distance': d})) + x.append( + gff_row( + cname, + source="cwpair", + start=midpoint, + end=midpoint + 1, + score=peak[3] + match[3], + attrs={"cw_distance": d}, + ) + ) if output_details: - detailed_output.writerow((cname, - peak[1], - peak[2], - peak[3], - '+', - cname, - match[1], - match[2], - match[3], '-', - midpoint, - peak[3] + match[3], - d)) + detailed_output.writerow( + ( + cname, + peak[1], + peak[2], + peak[3], + "+", + cname, + match[1], + match[2], + match[3], + "-", + midpoint, + peak[3] + match[3], + d, + ) + ) i = bisect.bisect_left(keys, (match[1] + match[2]) / 2) del crick[i] del keys[i] @@ -384,19 +442,19 @@ def td_writer(output_type, extension, fname): # Dataset in tuple cannot be modified in Python, so row will # be converted to list format to add 'chr'. if row_tmp[0] == "999": - row_tmp[0] = 'chrM' + row_tmp[0] = "chrM" elif row_tmp[0] == "998": - row_tmp[0] = 'chrY' + row_tmp[0] = "chrY" elif row_tmp[0] == "997": - row_tmp[0] = 'chrX' + row_tmp[0] = "chrX" else: row_tmp[0] = row_tmp[0] # Print row_tmp. matched_pairs_output.writerow(row_tmp) - statistics['paired'] = dist.size() * 2 - statistics['orphans'] = orphans - statistics['final_mode'] = dist.mode() + statistics["paired"] = dist.size() * 2 + statistics["orphans"] = orphans + statistics["final_mode"] = dist.mode() if output_plots: - frequency_plot([dist], final_plot_path, title='Frequency distribution') - statistics['dist'] = dist + frequency_plot([dist], final_plot_path, title="Frequency distribution") + statistics["dist"] = dist return statistics diff --git a/tools/ebi_tools/ebeye_urllib.py b/tools/ebi_tools/ebeye_urllib.py index 7f1b5eb8cda..c7e3207b399 100644 --- a/tools/ebi_tools/ebeye_urllib.py +++ b/tools/ebi_tools/ebeye_urllib.py @@ -22,7 +22,7 @@ from xmltramp2 import xmltramp # Service base URL -baseUrl = 'https://www.ebi.ac.uk/ebisearch/ws/rest' +baseUrl = "https://www.ebi.ac.uk/ebisearch/ws/rest" # Debug level debugLevel = 0 @@ -30,66 +30,63 @@ # Debug print def printDebugMessage(functionName, message, level): - if(level <= debugLevel): - print('[' + functionName + '] ' + message) + if level <= debugLevel: + print("[" + functionName + "] " + message) # User-agent for request. def getUserAgent(): - printDebugMessage('getUserAgent', 'Begin', 11) - urllib_agent = 'Python-urllib/%s' % platform.python_version() - clientRevision = '$Revision: 2468 $' - clientVersion = '0' + printDebugMessage("getUserAgent", "Begin", 11) + urllib_agent = "Python-urllib/%s" % platform.python_version() + clientRevision = "$Revision: 2468 $" + clientVersion = "0" if len(clientRevision) > 11: clientVersion = clientRevision[11:-2] - user_agent = 'EBI-Sample-Client/%s (%s; Python %s; %s) %s' % ( - clientVersion, os.path.basename(__file__), - platform.python_version(), platform.system(), - urllib_agent + user_agent = "EBI-Sample-Client/%s (%s; Python %s; %s) %s" % ( + clientVersion, + os.path.basename(__file__), + platform.python_version(), + platform.system(), + urllib_agent, ) - printDebugMessage('getUserAgent', 'user_agent: %s' % user_agent, 12) - printDebugMessage('getUserAgent', 'End', 11) + printDebugMessage("getUserAgent", "user_agent: %s" % user_agent, 12) + printDebugMessage("getUserAgent", "End", 11) return user_agent # Wrapper for a REST (HTTP GET) request def restRequest(url): - printDebugMessage('restRequest', 'Begin', 11) - printDebugMessage('restRequest', 'url: %s' % url, 11) + printDebugMessage("restRequest", "Begin", 11) + printDebugMessage("restRequest", "url: %s" % url, 11) url = quote(url, safe="%/:=&?~#+!$,;'@()*[]") try: user_agent = getUserAgent() - http_headers = { - 'User-Agent': user_agent, - 'Accept-Encoding': 'gzip' - } + http_headers = {"User-Agent": user_agent, "Accept-Encoding": "gzip"} req = Request(url, None, http_headers) resp = urlopen(req) - encoding = resp.info().get('Content-Encoding') + encoding = resp.info().get("Content-Encoding") result = None - if encoding is None or encoding == 'identity': - result = text_type(resp.read(), 'utf-8') - elif encoding == 'gzip': + if encoding is None or encoding == "identity": + result = text_type(resp.read(), "utf-8") + elif encoding == "gzip": result = resp.read() - printDebugMessage('restRequest', 'result: %s' % result, 21) - gz = GzipFile( - fileobj=BytesIO(result), - mode="r") - result = text_type(gz.read(), 'utf-8') + printDebugMessage("restRequest", "result: %s" % result, 21) + gz = GzipFile(fileobj=BytesIO(result), mode="r") + result = text_type(gz.read(), "utf-8") else: - raise Exception('Unsupported Content-Encoding') + raise Exception("Unsupported Content-Encoding") resp.close() except HTTPError as ex: raise ex - printDebugMessage('restRequest', 'result: %s' % result, 11) - printDebugMessage('restRequest', 'End', 11) + printDebugMessage("restRequest", "result: %s" % result, 11) + printDebugMessage("restRequest", "End", 11) return result def hasSubdomains(domainInfo): for dir in domainInfo._dir: - if dir._name == 'subdomains': + if dir._name == "subdomains": return True return False @@ -99,30 +96,31 @@ def extractUsefulFields(fieldInfos): retrievable = [] for fieldInfo in fieldInfos: - if fieldInfo('id') == "$facets": + if fieldInfo("id") == "$facets": continue - options = fieldInfo['options']['option':] + options = fieldInfo["options"]["option":] for option in options: if option("name") == "searchable" and str(option) == "true": - searchable.append(fieldInfo('id')) + searchable.append(fieldInfo("id")) if option("name") == "retrievable" and str(option) == "true": - retrievable.append(fieldInfo('id')) + retrievable.append(fieldInfo("id")) return searchable, retrievable def extractLowerLevelDomains(domainInfo, domains): if hasSubdomains(domainInfo): - subdomains = domainInfo['subdomains']['domain':] + subdomains = domainInfo["subdomains"]["domain":] for subdomain in subdomains: - domains = extractLowerLevelDomains( subdomain, domains) + domains = extractLowerLevelDomains(subdomain, domains) else: searchable, retrievable = extractUsefulFields( - domainInfo['fieldInfos']['fieldInfo':]) + domainInfo["fieldInfos"]["fieldInfo":] + ) - domain_id = domainInfo('id') + domain_id = domainInfo("id") domains.setdefault(domain_id, {}) - domains[domain_id]["name"] = domainInfo('name') + domains[domain_id]["name"] = domainInfo("name") domains[domain_id]["searchable_fields"] = sorted(searchable) domains[domain_id]["retrievable_fields"] = sorted(retrievable) return domains @@ -130,18 +128,18 @@ def extractLowerLevelDomains(domainInfo, domains): # Get domain Hierarchy def getDomainHierarchy(): - requestUrl = baseUrl + '/allebi' + requestUrl = baseUrl + "/allebi" xmlDoc = restRequest(requestUrl) doc = xmltramp.parse(xmlDoc) - allebi = doc['domains']['domain'] + allebi = doc["domains"]["domain"] lower_level_domains = extractLowerLevelDomains(allebi, {}) - printDebugMessage('getDomainHierarchy', 'End', 1) + printDebugMessage("getDomainHierarchy", "End", 1) return lower_level_domains # Check if a databaseInfo matches a database name. def is_database(dbInfo, dbName): - printDebugMessage('is_database', 'Begin', 11) + printDebugMessage("is_database", "Begin", 11) retVal = False if str(dbInfo.name) == dbName: retVal = True @@ -149,27 +147,27 @@ def is_database(dbInfo, dbName): for dbAlias in dbInfo.aliasList: if str(dbAlias) == dbName: retVal = True - printDebugMessage('is_database', 'retVal: %s' % retVal, 11) - printDebugMessage('is_database', 'End', 11) + printDebugMessage("is_database", "retVal: %s" % retVal, 11) + printDebugMessage("is_database", "End", 11) return retVal # Get number of results def getNumberOfResults(domain, query): - printDebugMessage('getNumberOfResults', 'Begin', 1) - requestUrl = baseUrl + '/' + domain + '?query=' + query + '&size=0' - printDebugMessage('getNumberOfResults', requestUrl, 2) + printDebugMessage("getNumberOfResults", "Begin", 1) + requestUrl = baseUrl + "/" + domain + "?query=" + query + "&size=0" + printDebugMessage("getNumberOfResults", requestUrl, 2) xmlDoc = restRequest(requestUrl) doc = xmltramp.parse(xmlDoc) - numberOfResults = int(str(doc['hitCount'])) - printDebugMessage('getNumberOfResults', 'End', 1) + numberOfResults = int(str(doc["hitCount"])) + printDebugMessage("getNumberOfResults", "End", 1) return numberOfResults def makeRequest(requestUrl): xmlDoc = restRequest(requestUrl) doc = xmltramp.parse(xmlDoc) - entries = doc['entries']['entry':] + entries = doc["entries"]["entry":] formatted_output = printEntries(entries) return formatted_output @@ -181,21 +179,21 @@ def getResults(domain, query, fields): quotient = numberOfResults // maximum_size start = 0 - printDebugMessage('getResults', 'Begin', 1) + printDebugMessage("getResults", "Begin", 1) request_output = "%s\tlink\n" % (fields.replace(",", "\t")) for i in range(quotient): start = maximum_size * i - requestUrl = baseUrl + '/' + domain + '?query=' + query - requestUrl += '&fields=' + fields + '&size=' + str(maximum_size) - requestUrl += '&start=' + str(start) + '&fieldurl=true' + requestUrl = baseUrl + "/" + domain + "?query=" + query + requestUrl += "&fields=" + fields + "&size=" + str(maximum_size) + requestUrl += "&start=" + str(start) + "&fieldurl=true" request_output += makeRequest(requestUrl) if (numberOfResults % 100) > 0: start = maximum_size * quotient remainder = numberOfResults - start - requestUrl = baseUrl + '/' + domain + '?query=' + query - requestUrl += '&fields=' + fields + '&size=' + str(remainder) - requestUrl += '&start=' + str(start) + '&fieldurl=true' + requestUrl = baseUrl + "/" + domain + "?query=" + query + requestUrl += "&fields=" + fields + "&size=" + str(remainder) + requestUrl += "&start=" + str(start) + "&fieldurl=true" request_output += makeRequest(requestUrl) print(request_output) @@ -203,15 +201,15 @@ def getResults(domain, query, fields): def printEntries(entries): output = "" - printDebugMessage('printEntries', 'Begin', 1) + printDebugMessage("printEntries", "Begin", 1) for entry in entries: sep = "" - for field in entry['fields']['field':]: + for field in entry["fields"]["field":]: output += "%s" % (sep) - fields = field['values']['value':] + fields = field["values"]["value":] if len(fields) > 0: sub_sep = "" - for value in field['values']['value':]: + for value in field["values"]["value":]: output += "%s%s" % (sub_sep, value) sub_sep = "," sep = "\t" @@ -219,53 +217,53 @@ def printEntries(entries): if hasFieldUrls(entry): output += "%s" % (sep) sub_sep = "" - for fieldurl in entry['fieldURLs']['fieldURL':]: + for fieldurl in entry["fieldURLs"]["fieldURL":]: output += "%s%s" % (sub_sep, str(fieldurl)) sub_sep = "," sep = "\t" if hasViewUrls(entry): output += "%s" % (sep) sub_sep = "" - for viewurl in entry['viewURLs']['viewURL':]: + for viewurl in entry["viewURLs"]["viewURL":]: output += "%s%s" % (sub_sep, str(viewurl)) sub_sep = "," output += "\n" - printDebugMessage('printEntries', 'End', 1) + printDebugMessage("printEntries", "End", 1) return output def hasFieldUrls(entry): for dir in entry._dir: - if dir._name == 'fieldURLs': + if dir._name == "fieldURLs": return True return False def hasViewUrls(entry): for dir in entry._dir: - if dir._name == 'viewURLs': + if dir._name == "viewURLs": return True return False def getRunLink(run_id): - printDebugMessage('getEntries', 'Begin', 1) - requestUrl = baseUrl + '/metagenomics_runs/entry/' + run_id + '?fieldurl=true' - printDebugMessage('getEntries', requestUrl, 2) + printDebugMessage("getEntries", "Begin", 1) + requestUrl = baseUrl + "/metagenomics_runs/entry/" + run_id + "?fieldurl=true" + printDebugMessage("getEntries", requestUrl, 2) xmlDoc = restRequest(requestUrl) doc = xmltramp.parse(xmlDoc) - entries = doc['entries']['entry':] - fieldURL = '' + entries = doc["entries"]["entry":] + fieldURL = "" for entry in entries: - for fieldurl in entry['fieldURLs']['fieldURL':]: + for fieldurl in entry["fieldURLs"]["fieldURL":]: fieldURL += str(fieldurl) - printDebugMessage('getEntries', 'End', 1) - p = re.compile('http') - fieldURL = p.sub('https', fieldURL) + printDebugMessage("getEntries", "End", 1) + p = re.compile("http") + fieldURL = p.sub("https", fieldURL) print(fieldURL) -if __name__ == '__main__': +if __name__ == "__main__": # Usage message usage = """ %prog getDomainHierarchy @@ -277,10 +275,7 @@ def getRunLink(run_id): description += "The searching tools are using the EB-eye search engine. " description += "http://www.ebi.ac.uk/ebisearch/" # Process command-line options - parser = OptionParser( - usage=usage, - description=description, - version='1.0') + parser = OptionParser(usage=usage, description=description, version="1.0") (options, args) = parser.parse_args() # No arguments, print usage @@ -288,24 +283,24 @@ def getRunLink(run_id): parser.print_help() # Get domain hierarchy - elif args[0] == 'getDomainHierarchy': + elif args[0] == "getDomainHierarchy": getDomainHierarchy() # Get search results - elif args[0] == 'getResults': + elif args[0] == "getResults": if len(args) < 4: - print('domain, query and fields should be given.') + print("domain, query and fields should be given.") else: getResults(args[1], args[2], args[3]) # Get run link results - elif args[0] == 'getRunLink': + elif args[0] == "getRunLink": if len(args) < 2: - print('run id should be given.') + print("run id should be given.") else: getRunLink(args[1]) # Unknown argument combination, display usage else: - print('Error: unrecognised argument combination') + print("Error: unrecognised argument combination") parser.print_help() diff --git a/tools/extract_genomic_dna/extract_genomic_dna.py b/tools/extract_genomic_dna/extract_genomic_dna.py index 67efa7482cf..01a44925481 100644 --- a/tools/extract_genomic_dna/extract_genomic_dna.py +++ b/tools/extract_genomic_dna/extract_genomic_dna.py @@ -11,24 +11,47 @@ import extract_genomic_dna_utils as egdu # noqa: I100,I202 parser = argparse.ArgumentParser() -parser.add_argument('--input_format', dest='input_format', help="Input dataset format") -parser.add_argument('--input', dest='input', help="Input dataset") -parser.add_argument('--genome', dest='genome', help="Input dataset genome build") -parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") -parser.add_argument('--columns', dest='columns', help="Columns to use in input file") -parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") -parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file") -parser.add_argument('--output_format', dest='output_format', help="Output format") -parser.add_argument('--fasta_header_type', dest='fasta_header_type', default=None, help="Fasta header format") -parser.add_argument('--fasta_header_delimiter', dest='fasta_header_delimiter', default=None, help="Fasta header field delimiter") -parser.add_argument('--output', dest='output', help="Output dataset") +parser.add_argument("--input_format", dest="input_format", help="Input dataset format") +parser.add_argument("--input", dest="input", help="Input dataset") +parser.add_argument("--genome", dest="genome", help="Input dataset genome build") +parser.add_argument( + "--interpret_features", + dest="interpret_features", + default=None, + help="Interpret features if input format is gff", +) +parser.add_argument("--columns", dest="columns", help="Columns to use in input file") +parser.add_argument( + "--reference_genome_source", + dest="reference_genome_source", + help="Source of reference genome file", +) +parser.add_argument( + "--reference_genome", dest="reference_genome", help="Reference genome file" +) +parser.add_argument("--output_format", dest="output_format", help="Output format") +parser.add_argument( + "--fasta_header_type", + dest="fasta_header_type", + default=None, + help="Fasta header format", +) +parser.add_argument( + "--fasta_header_delimiter", + dest="fasta_header_delimiter", + default=None, + help="Fasta header field delimiter", +) +parser.add_argument("--output", dest="output", help="Output dataset") args = parser.parse_args() -input_is_gff = args.input_format == 'gff' +input_is_gff = args.input_format == "gff" interpret_features = input_is_gff and args.interpret_features == "yes" -if len(args.columns.split(',')) == 5: +if len(args.columns.split(",")) == 5: # Bed file. - chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns) + chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg( + args.columns + ) else: # Gff file. chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) @@ -47,13 +70,13 @@ first_invalid_line = 0 invalid_lines = [] warnings = [] -warning = '' +warning = "" twobitfile = None line_count = 1 file_iterator = open(args.input) if interpret_features: file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) -out = open(args.output, 'wt') +out = open(args.output, "wt") for feature in file_iterator: # Ignore comments, headers. @@ -70,9 +93,9 @@ strand = feature.strand else: # Processing lines, either interval or GFF format. - line = feature.rstrip('\r\n') + line = feature.rstrip("\r\n") if line and not line.startswith("#"): - fields = line.split('\t') + fields = line.split("\t") try: chrom = fields[chrom_col] start = int(fields[start_col]) @@ -99,9 +122,9 @@ first_invalid_line = line_count skipped_lines += len(invalid_lines) continue - if strand not in ['+', '-']: - strand = '+' - sequence = '' + if strand not in ["+", "-"]: + strand = "+" + sequence = "" else: continue # Open sequence file and get sequence for feature/interval. @@ -109,11 +132,16 @@ if chrom in nibs: nib = nibs[chrom] else: - nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom))) + nibs[chrom] = nib = bx.seq.nib.NibFile( + open("%s/%s.nib" % (seq_path, chrom)) + ) try: sequence = nib.get(start, end - start) except Exception: - warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) + warning = ( + "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " + % (start, end - start, args.genome) + ) warnings.append(warning) if not invalid_lines: invalid_lines = egdu.get_lines(feature) @@ -121,18 +149,23 @@ skipped_lines += len(invalid_lines) continue elif os.path.isfile(seq_path): - if not(twobitfile): + if not (twobitfile): twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) try: if interpret_features: # Create sequence from intervals within a feature. - sequence = '' + sequence = "" for interval in feature.intervals: - sequence += twobitfile[interval.chrom][interval.start:interval.end] + sequence += twobitfile[interval.chrom][ + interval.start: interval.end + ] else: sequence = twobitfile[chrom][start:end] except Exception: - warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) + warning = ( + "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " + % (start, end - start, chrom) + ) warnings.append(warning) if not invalid_lines: invalid_lines = egdu.get_lines(feature) @@ -140,15 +173,21 @@ skipped_lines += len(invalid_lines) continue else: - warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) + warning = "Chromosome by name '%s' was not found for build '%s'. " % ( + chrom, + args.genome, + ) warnings.append(warning) if not invalid_lines: invalid_lines = egdu.get_lines(feature) first_invalid_line = line_count skipped_lines += len(invalid_lines) continue - if sequence == '': - warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) + if sequence == "": + warning = ( + "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " + % (chrom, start, end, args.genome) + ) warnings.append(warning) if not invalid_lines: invalid_lines = egdu.get_lines(feature) @@ -161,15 +200,18 @@ if input_is_gff: start, end = egdu.convert_bed_coords_to_gff([start, end]) if args.fasta_header_type == "bedtools_getfasta_default": - out.write(">%s\n" % egdu.get_bedtools_getfasta_default_header(str(chrom), - str(start), - str(end), - strand, - includes_strand_col)) + out.write( + ">%s\n" + % egdu.get_bedtools_getfasta_default_header( + str(chrom), str(start), str(end), strand, includes_strand_col + ) + ) else: # args.fasta_header_type == "char_delimited": fields = [args.genome, str(chrom), str(start), str(end), strand] - field_delimiter = egdu.get_fasta_header_delimiter(args.fasta_header_delimiter) + field_delimiter = egdu.get_fasta_header_delimiter( + args.fasta_header_delimiter + ) meta_data = field_delimiter.join(fields) if name.strip(): out.write(">%s %s\n" % (meta_data, name)) @@ -184,20 +226,24 @@ else: # output_format == "interval". if interpret_features: - meta_data = "\t".join([feature.chrom, - "galaxy_extract_genomic_dna", - "interval", - str(feature.start), - str(feature.end), - feature.score, - feature.strand, - ".", - egdu.gff_attributes_to_str(feature.attributes, "GTF")]) + meta_data = "\t".join( + [ + feature.chrom, + "galaxy_extract_genomic_dna", + "interval", + str(feature.start), + str(feature.end), + feature.score, + feature.strand, + ".", + egdu.gff_attributes_to_str(feature.attributes, "GTF"), + ] + ) else: # Here fields was set up around line 73. meta_data = "\t".join(fields) if input_is_gff: - format_str = "%s seq \"%s\";\n" + format_str = '%s seq "%s";\n' else: format_str = "%s\t%s\n" out.write(format_str % (meta_data, str(sequence))) @@ -214,7 +260,10 @@ print(warn_msg) if skipped_lines: # Error message includes up to the first 10 skipped lines. - print('Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))) + print( + 'Skipped %d invalid lines, 1st is #%d, "%s"' + % (skipped_lines, first_invalid_line, "\n".join(invalid_lines[:10])) + ) if args.reference_genome_source == "history": os.remove(seq_path) diff --git a/tools/repmatch_gff3/repmatch_gff3_util.py b/tools/repmatch_gff3/repmatch_gff3_util.py index 4d765c37775..c2d2ba75243 100644 --- a/tools/repmatch_gff3/repmatch_gff3_util.py +++ b/tools/repmatch_gff3/repmatch_gff3_util.py @@ -230,18 +230,18 @@ def frequency_histogram(freqs, dataset_path, labels=[], title=''): pyplot.ylabel(Y_LABEL) pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) ax = pyplot.gca() - for l in ax.get_xticklines() + ax.get_yticklines(): - l.set_markeredgewidth(TICK_WIDTH) + for line in ax.get_xticklines() + ax.get_yticklines(): + line.set_markeredgewidth(TICK_WIDTH) pyplot.savefig(dataset_path) METHODS = {'closest': match_closest, 'largest': match_largest} -def gff_attrs(l): - if len(l) == 0: +def gff_attrs(attrs): + if len(attrs) == 0: return '.' - return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l) + return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in attrs) def parse_gff_attrs(s): diff --git a/tools/tn93/tn93_cluster.py b/tools/tn93/tn93_cluster.py index 2e177ab8831..ebb2ae51cc9 100755 --- a/tools/tn93/tn93_cluster.py +++ b/tools/tn93/tn93_cluster.py @@ -12,29 +12,34 @@ def cluster_to_fasta(json_file, fasta_file, reference_name=None): with open(fasta_file, "w") as fh2: for c in cluster_json: if reference_name is not None: - if reference_name in c['members']: - cc = c['centroid'].split('\n') + if reference_name in c["members"]: + cc = c["centroid"].split("\n") cc[0] = ">" + reference_name print("\n".join(cc), file=fh2) continue - print(c['centroid'], file=fh2) + print(c["centroid"], file=fh2) - return(os.path.getmtime(fasta_file), len(cluster_json)) + return (os.path.getmtime(fasta_file), len(cluster_json)) def run_command(command): - proc = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc = subprocess.Popen( + shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) stdout, stderr = proc.communicate() result = proc.returncode if result != 0: - print('Command `%s` failed with exit code %s\n' % (command, result), file=sys.stderr) - print('--------------------- STDOUT ---------------------') - print(stdout.decode().replace('\\n', '\n')) - print('------------------- END STDOUT -------------------') - print('--------------------- STDERR ---------------------', file=sys.stderr) - print(stderr.decode().replace('\\n', '\n'), file=sys.stderr) - print('------------------- END STDERR -------------------', file=sys.stderr) - return(int(result)) + print( + "Command `%s` failed with exit code %s\n" % (command, result), + file=sys.stderr, + ) + print("--------------------- STDOUT ---------------------") + print(stdout.decode().replace("\\n", "\n")) + print("------------------- END STDOUT -------------------") + print("--------------------- STDERR ---------------------", file=sys.stderr) + print(stderr.decode().replace("\\n", "\n"), file=sys.stderr) + print("------------------- END STDERR -------------------", file=sys.stderr) + return int(result) def main(arguments): @@ -42,34 +47,55 @@ def main(arguments): step = threshold * 0.25 with open(arguments.reference) as fh: for line in fh: - if line[0] == '>': - _ref_seq_name = line[1:].split(' ')[0].strip() + if line[0] == ">": + _ref_seq_name = line[1:].split(" ")[0].strip() break while threshold <= 1: - command = 'tn93-cluster -o clusters.json -t %g -a %s -c %s -m json -l %d -g %f %s' % (threshold, arguments.ambigs, arguments.cluster_type, arguments.overlap, arguments.fraction, arguments.input) + command = ( + "tn93-cluster -o clusters.json -t %g -a %s -c %s -m json -l %d -g %f %s" + % ( + threshold, + arguments.ambigs, + arguments.cluster_type, + arguments.overlap, + arguments.fraction, + arguments.input, + ) + ) return_code = run_command(command) if return_code != 0: return return_code - input_stamp, cluster_count = cluster_to_fasta('clusters.json', 'clusters.fa', _ref_seq_name) + input_stamp, cluster_count = cluster_to_fasta( + "clusters.json", "clusters.fa", _ref_seq_name + ) if cluster_count <= arguments.cluster_count: break else: threshold += step - print('Found %d clusters at threshold %f' % (cluster_count, threshold)) + print("Found %d clusters at threshold %f" % (cluster_count, threshold)) return 0 -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Combine alignments into a single file, adding a reference sequence as well') - parser.add_argument('--input', help='Input MSA', required=True, type=str) - parser.add_argument('--reference', help='Reference sequence', required=True, type=str) - parser.add_argument('--output', help='Input MSA', required=True, type=str) - parser.add_argument('--threshold', help='Threshold', required=True, type=float) - parser.add_argument('--ambigs', help='Handle ambigs', required=True, type=str) - parser.add_argument('--cluster-type', help='Cluster type', required=True, type=str) - parser.add_argument('--overlap', help='Overlap', required=True, type=int) - parser.add_argument('--fraction', help='Fraction', required=True, type=float) - parser.add_argument('--cluster-count', help='Max query', required=True, type=int) - parser.add_argument('--compressed', help='File to write compressed clusters to', required=True, type=str) +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Combine alignments into a single file, adding a reference sequence as well" + ) + parser.add_argument("--input", help="Input MSA", required=True, type=str) + parser.add_argument( + "--reference", help="Reference sequence", required=True, type=str + ) + parser.add_argument("--output", help="Input MSA", required=True, type=str) + parser.add_argument("--threshold", help="Threshold", required=True, type=float) + parser.add_argument("--ambigs", help="Handle ambigs", required=True, type=str) + parser.add_argument("--cluster-type", help="Cluster type", required=True, type=str) + parser.add_argument("--overlap", help="Overlap", required=True, type=int) + parser.add_argument("--fraction", help="Fraction", required=True, type=float) + parser.add_argument("--cluster-count", help="Max query", required=True, type=int) + parser.add_argument( + "--compressed", + help="File to write compressed clusters to", + required=True, + type=str, + ) arguments = parser.parse_args() exit(main(arguments)) diff --git a/tools/vsnp/vsnp_determine_ref_from_data.py b/tools/vsnp/vsnp_determine_ref_from_data.py index 8261c7fe09c..4f118ae97d9 100755 --- a/tools/vsnp/vsnp_determine_ref_from_data.py +++ b/tools/vsnp/vsnp_determine_ref_from_data.py @@ -8,8 +8,8 @@ import yaml from Bio.SeqIO.QualityIO import FastqGeneralIterator -OUTPUT_DBKEY_DIR = 'output_dbkey' -OUTPUT_METRICS_DIR = 'output_metrics' +OUTPUT_DBKEY_DIR = "output_dbkey" +OUTPUT_METRICS_DIR = "output_metrics" def get_sample_name(file_path): @@ -66,7 +66,15 @@ def get_dnaprints_dict(dnaprint_fields): return dnaprints_dict -def get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum): +def get_group_and_dbkey( + dnaprints_dict, + brucella_string, + brucella_sum, + bovis_string, + bovis_sum, + para_string, + para_sum, +): if brucella_sum > 3: group = "Brucella" dbkey = get_dbkey(dnaprints_dict, "brucella", brucella_string) @@ -118,14 +126,14 @@ def get_seq_counts(value, fastq_list, gzipped): count = 0 for fastq_file in fastq_list: if gzipped: - with gzip.open(fastq_file, 'rt') as fh: + with gzip.open(fastq_file, "rt") as fh: for title, seq, qual in FastqGeneralIterator(fh): count += seq.count(value) else: - with open(fastq_file, 'r') as fh: + with open(fastq_file, "r") as fh: for title, seq, qual in FastqGeneralIterator(fh): count += seq.count(value) - return(value, count) + return (value, count) def get_species_counts(fastq_list, gzipped): @@ -157,11 +165,11 @@ def get_species_strings(count_summary): for v in binary_dictionary.values(): binary_list.append(v) brucella_binary = binary_list[:16] - brucella_string = ''.join(str(e) for e in brucella_binary) + brucella_string = "".join(str(e) for e in brucella_binary) bovis_binary = binary_list[16:24] - bovis_string = ''.join(str(e) for e in bovis_binary) + bovis_string = "".join(str(e) for e in bovis_binary) para_binary = binary_list[24:] - para_string = ''.join(str(e) for e in para_binary) + para_string = "".join(str(e) for e in para_binary) return brucella_string, bovis_string, para_string @@ -194,15 +202,42 @@ def output_metrics(file_name, count_list, group, dbkey, output_file): fh.write("\ndbkey: %s\n" % dbkey) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--dnaprint_fields', action='append', dest='dnaprint_fields', nargs=2, help="List of dnaprints data table value, name and path fields") - parser.add_argument('--read1', action='store', dest='read1', help='Required: single read') - parser.add_argument('--read2', action='store', dest='read2', required=False, default=None, help='Optional: paired read') - parser.add_argument('--gzipped', action='store_true', dest='gzipped', help='Input files are gzipped') - parser.add_argument('--output_dbkey', action='store', dest='output_dbkey', help='Output reference file') - parser.add_argument('--output_metrics', action='store', dest='output_metrics', help='Output metrics file') + parser.add_argument( + "--dnaprint_fields", + action="append", + dest="dnaprint_fields", + nargs=2, + help="List of dnaprints data table value, name and path fields", + ) + parser.add_argument( + "--read1", action="store", dest="read1", help="Required: single read" + ) + parser.add_argument( + "--read2", + action="store", + dest="read2", + required=False, + default=None, + help="Optional: paired read", + ) + parser.add_argument( + "--gzipped", action="store_true", dest="gzipped", help="Input files are gzipped" + ) + parser.add_argument( + "--output_dbkey", + action="store", + dest="output_dbkey", + help="Output reference file", + ) + parser.add_argument( + "--output_metrics", + action="store", + dest="output_metrics", + help="Output metrics file", + ) args = parser.parse_args() @@ -219,7 +254,24 @@ def output_metrics(file_name, count_list, group, dbkey, output_file): # Here fastq_list consists of either a single read # or a set of paired reads, producing single outputs. - count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts(fastq_list, args.gzipped) + count_summary, count_list, brucella_sum, bovis_sum, para_sum = get_species_counts( + fastq_list, args.gzipped + ) brucella_string, bovis_string, para_string = get_species_strings(count_summary) - group, dbkey = get_group_and_dbkey(dnaprints_dict, brucella_string, brucella_sum, bovis_string, bovis_sum, para_string, para_sum) - output_files(args.read1, count_list, group, dbkey, dbkey_file=args.output_dbkey, metrics_file=args.output_metrics) + group, dbkey = get_group_and_dbkey( + dnaprints_dict, + brucella_string, + brucella_sum, + bovis_string, + bovis_sum, + para_string, + para_sum, + ) + output_files( + args.read1, + count_list, + group, + dbkey, + dbkey_file=args.output_dbkey, + metrics_file=args.output_metrics, + )