# pubMap

#!/usr/bin/env python2

# first load the standard libraries from python
# we require at least python 2.7
from __future__ import print_function
import sys
# refuse to run on python 2.0 - 2.6 and tell the user how to get a newer interpreter
if sys.version_info[0] == 2 and sys.version_info[1] < 7:
    print("Sorry, this program requires at least python 2.7")
    print("You can download a more current python version from python.org and compile it")
    print("into your homedir (or anywhere else) with 'configure --prefix ~/python27'; make;")
    print("then run this program again by specifying your own python executable like this: ")
    print("   ~/python27/bin/python <%s>" % sys.argv[0])
    print("or add ~/python27/bin to your PATH before /usr/bin")
    # sys.exit rather than exit(): the latter is a helper injected by the site
    # module for interactive use and is missing when python runs with -S
    sys.exit(1)

# load default python packages
import logging, optparse, os, atexit, datetime
from os.path import *

# put <scriptDir>/lib/ at the front of the module search path, so the pubXXX
# libraries imported further down are taken from the directory next to this script
progFile = abspath(sys.argv[0])
progDir = dirname(progFile)
pubToolsLibDir = join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)

# now load our own libraries
import pubConf, pubGeneric, pubAlg, maxCommon, tabfile, pubMap, pubMapProp

# GLOBAL: filename of lockfile
# stays None until checkCreateLock() has built the path from the output
# directory; removeLock() reads it to find the file it has to delete
lockFname = None

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
# The usage text is the only manual for the individual pipeline steps, so it is
# kept in a variable of its own and handed to the option parser unchanged.
usageText = """usage: %prog [options] <publisher> <step> - pipeline to map fulltext files to genome locations and 
create tracks for genome browser

If target directory already exists, will only process updateIds which have not been processed yet

pubMap requires some external data:
- UCSC genome sequences as 2bit, found via pubConf.GBCONF which points to genbank.conf
- optional: genome loci with an assignment genomic range <-> entrez gene Id &
  symbol, created by pubPrepCdnaDir
- optional: cdna sequences created by pubPrepCdnaDir
- optional: genome sequences not part of genbank.conf as 2bit, found via
  pubConf.nonUcscGenomesDir

STEPS FOR SCANNING TEXT FOR SEQUENCES:

"annot": extract DNA/protein seqs and accessions from text files 
(annotators: scripts/dnaFinder.py, scripts/protFinder.py, scripts/markerFinder.py)
   example:
     pubMap pmc annot

optional:
"annotMarkerDbg": runs only the accession finder, for testing
"annotSeqDbg": runs only the DNA/protein finders, for testing

STEPS FOR BLATTING ONTO GENOMES/CDNA/PROT:

(use pubPrepCdnaDir to build the directory with mRNA sequences)

"filter": remove duplicate and very short DNA/Prot sequences, convert to fasta files
   one fasta file per genome
   input: <baseDir>/annot, output: <baseDir>/blatGenome/seq and <baseDir>/blatProt/seq
   example: pubMap pmc filter 

"blat": submit blat jobs of DNA against genomes, cdna, proteins, 
   separates job into short/long sequences
   output: <baseDir>/blat/{genome,cdna,prot}
   example: pubMap pmc blat

"sort": sort all blat output files, map cdna/prot to genome, run pslCDnaFilter
   example: pubMap pmc sort

"chain": merge cDNA, genome and protein DNA psl from all species into one file, split and
    chain psls and convert to a raw version of bed (for debugging as custom track)
    input: <baseDir>/{blatGenome,blatCdna}/sortedPsl/ 
    output: <baseDir>/blatGenome/bed
    example:
    pubMap pmc chain

STEPS FOR LOADING RESULTS INTO GENOME BROWSER:

"identifiers": run a map/reduce job to retrieve
    the names of files for all fileIds and write to 
    <outDir>/files.tab

"tables": create tables for hgLoadxxx
   - reads in articleIds.lst from <baseDir>/markers
   - reads impacts, article descriptions and article classification results
   - reads sequence info from <outDir>/seqs
   - uses textDir.conf to find the fulltext files and parse
     out basic article information
   - writes article and seq info to tab-files in <outDir>/tables/
   example: pubMap pmc tables

"load": load all tables with hgLoadSql/hgLoadBed into browser
   example:
   pubMap pmc,elsevier load 

   Will process all of <outDir> tables and <outDir>/updates/<i>/tables
   Note that you need to supply --loadFinal if you really want to
   write to the real hgwdev tables.

   This command will use the table hgFixed.pubsLoadedFiles to keep
   track of already loaded files. It will not load files that have already
   been loaded into mysql before.

OTHER COMMANDS:

"<type>": run a list of the steps described above.
    <type> can be either "all" or a range, like "annot-load"
    "all" is the same as "annot-tables"

dropAll: remove all tables (by default the pubsDev... tables)
switchOver: move all tables from the pubsDev to the pubs prefix
expFa: export all sequences as fasta files to pubConf.faDir
expCdr3: export all CDR3-like prot seqs as fasta and tab files to pubConf.cdr3Dir

"""
parser = optparse.OptionParser(usage=usageText)
# options specific to pubMap
# NOTE(review): the usage text mentions --loadFinal, which is not among the options
# added here; it may be defined by pubGeneric.addGeneralOptions - confirm
parser.add_option(
    "-s", "--skipConvert",
    dest="skipConvert",
    action="store_true",
    default=False,
    help="do not convert sequences or convert to fasta or sort/chain, "
         "only do the second part of the processing for blat, blatCdna or chain. "
         "Used for debugging.")
parser.add_option(
    "-o", "--outDir",
    dest="outDir",
    action="store",
    default=pubConf.pubMapBaseDir,
    help="overwrite the default output directory from pubConf, default %default")
parser.add_option(
    "-l", "--activateLog",
    dest="activateLog",
    action="store_true",
    default=False,
    help="write logging to pubMap.<date>.log")
# no default given: options.onlyDb is None unless --onlyDb was used at least once
parser.add_option(
    "", "--onlyDb",
    dest="onlyDb",
    action="append",
    help="only use certain genomes for blatting, can be specified several times")
parser.add_option(
    "", "--tablePrefix",
    dest="tablePrefix",
    action="store",
    default="Dev",
    help="by default, the 'load' step will load into tables with the pubsDev prefix. "
         "If you want to load into final tables, set this to ''. "
         "Default value %default. "
         "The prefix 'pubs' is always added to the prefix.")

# options shared by the pub* tools are added by the library
pubGeneric.addGeneralOptions(parser)

# parsed at module level: "options" and "args" are used by the main part of the script
options, args = parser.parse_args()

# ==== FUNCTIONs =====
def main(args, options):
    """ run a single pipeline step, a range of steps or all steps for a dataset

    args has to be the pair (dataset, command) from the command line. A
    command containing a dash is split into "<firstStep>-<lastStep>", "all"
    stands for the range annot-tables; both are handed to
    pubMap.runStepRange. Any other command goes to pubMap.runStep as it is.
    """
    if options.debug:
        pubMap.DEBUG = True

    dataset, command = args

    # step names handed to pubMap.runStepRange for range commands
    stepNames = [
        "annot", "annotMarkerDbg", "annotSeqDbg",
        "filter", "blat", "sort", "chain",
        "identifiers", "tables", "load",
    ]

    # work out the first and last step of a range, or run the single step right away
    if "-" in command:
        firstStep, lastStep = command.split("-")
    elif command == "all":
        firstStep, lastStep = "annot", "tables"
    else:
        pubMap.runStep(command, dataset, options)
        return

    pubMap.runStepRange(dataset, stepNames, firstStep, lastStep, args, options)

def removeLock():
    """ delete the lockfile named by the global lockFname

    Does nothing if no lockfile path was ever set (lockFname is None) or if
    the file is already gone, e.g. because the user removed it by hand. This
    function is registered as an exit handler by checkCreateLock and should
    not end an otherwise successful run with a traceback in these cases.
    Any other OSError (e.g. missing permissions) is passed on.
    """
    global lockFname
    import errno

    if lockFname is None:
        return

    logging.debug("Removing lockfile %s" % lockFname)
    try:
        os.remove(lockFname)
    except OSError as err:
        # a lockfile that is already gone is fine, everything else is not
        if err.errno != errno.ENOENT:
            raise

def checkCreateLock(outDir):
    """ create the lockfile <outDir>/_pubMap.lock and register its removal at exit

    Sets the global lockFname. Raises Exception if the lockfile already
    exists. Other problems when creating the file (e.g. outDir does not
    exist) are passed on as OSError.
    """
    global lockFname
    import errno

    lockFname = join(outDir, "_pubMap.lock")
    logging.debug("Creating lockfile %s" % lockFname)
    try:
        # O_EXCL turns "does it exist?" and "create it" into one atomic step, so two
        # pipelines started at the same time cannot both believe they own the lock
        lockFd = os.open(lockFname, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o666)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
        lockFd = None

    if lockFd is None:
        # adjacent literals instead of a backslash inside the string: the latter
        # pulled the indentation of the second line into the message
        raise Exception("File %s exists - it seems that pipeline is already running. "
            "If you're sure that this is not the case, remove the lockfile and retry again" % lockFname)

    # the file only has to exist, nothing is written to it
    os.close(lockFd)
    # register the handler only once the lock is ours, so a run that failed to
    # get the lock never deletes the lockfile of the pipeline that holds it
    atexit.register(removeLock)

# ----------- MAIN --------------
# exactly two positional arguments are needed: <publisher> and <step>.
# Anything else gets the help text instead of an unpack error inside main()
if len(args) != 2:
    parser.print_help()
    # sys.exit rather than exit(): the latter is a helper injected by the site
    # module for interactive use and is missing when python runs with -S
    sys.exit(1)

# setup logging
# the logfile name carries the current date as ddmmyy and lives in pubConf.logDir
# NOTE(review): options.activateLog (-l) is not consulted here, the logfile name is
# always passed on - confirm whether pubGeneric.setupLogging checks the option itself
dateStr = datetime.datetime.now().strftime("%d%m%y")
logFileName = join(pubConf.logDir, 'pubMap.%s.log' % dateStr)
fileMode = "a"

pubGeneric.setupLogging(__file__, options, logFileName=logFileName, fileMode=fileMode)

main(args, options)