diff --git a/data/nontn7/fncas12a.fasta b/data/nontn7/fncas12a.fasta new file mode 100644 index 00000000..26df1df6 --- /dev/null +++ b/data/nontn7/fncas12a.fasta @@ -0,0 +1,23 @@ +>sp|A0Q7Q2|CS12A_FRATN CRISPR-associated endonuclease Cas12a OS=Francisella tularensis subsp. novicida (strain U112) OX=401614 GN=cas12a PE=1 SV=1 +MSIYQEFVNKYSLSKTLRFELIPQGKTLENIKARGLILDDEKRAKDYKKAKQIIDKYHQF +FIEEILSSVCISEDLLQNYSDVYFKLKKSDDDNLQKDFKSAKDTIKKQISEYIKDSEKFK +NLFNQNLIDAKKGQESDLILWLKQSKDNGIELFKANSDITDIDEALEIIKSFKGWTTYFK +GFHENRKNVYSSNDIPTSIIYRIVDDNLPKFLENKAKYESLKDKAPEAINYEQIKKDLAE +ELTFDIDYKTSEVNQRVFSLDEVFEIANFNNYLNQSGITKFNTIIGGKFVNGENTKRKGI +NEYINLYSQQINDKTLKKYKMSVLFKQILSDTESKSFVIDKLEDDSDVVTTMQSFYEQIA +AFKTVEEKSIKETLSLLFDDLKAQKLDLSKIYFKNDKSLTDLSQQVFDDYSVIGTAVLEY +ITQQIAPKNLDNPSKKEQELIAKKTEKAKYLSLETIKLALEEFNKHRDIDKQCRFEEILA +NFAAIPMIFDEIAQNKDNLAQISIKYQNQGKKDLLQASAEDDVKAIKDLLDQTNNLLHKL +KIFHISQSEDKANILDKDEHFYLVFEECYFELANIVPLYNKIRNYITQKPYSDEKFKLNF +ENSTLANGWDKNKEPDNTAILFIKDDKYYLGVMNKKNNKIFDDKAIKENKGEGYKKIVYK +LLPGANKMLPKVFFSAKSIKFYNPSEDILRIRNHSTHTKNGSPQKGYEKFEFNIEDCRKF +IDFYKQSISKHPEWKDFGFRFSDTQRYNSIDEFYREVENQGYKLTFENISESYIDSVVNQ +GKLYLFQIYNKDFSAYSKGRPNLHTLYWKALFDERNLQDVVYKLNGEAELFYRKQSIPKK +ITHPAKEAIANKNKDNPKKESVFEYDLIKDKRFTEDKFFFHCPITINFKSSGANKFNDEI +NLLLKEKANDVHILSIDRGERHLAYYTLVDGKGNIIKQDTFNIIGNDRMKTNYHDKLAAI +EKDRDSARKDWKKINNIKEMKEGYLSQVVHEIAKLVIEYNAIVVFEDLNFGFKRGRFKVE +KQVYQKLEKMLIEKLNYLVFKDNEFDKTGGVLRAYQLTAPFETFKKMGKQTGIIYYVPAG +FTSKICPVTGFVNQLYPKYESVSKSQEFFSKFDKICYNLDKGYFEFSFDYKNFGDKAAKG +KWTIASFGSRLINFRNSDKNHNWDTREVYPTKELEKLLKDYSIEYGHGECIKAAICGESD +KKFFAKLTSVLNTILQMRNSKTGTELDYLISPVADVNGNFFDSRQAPKNMPQDADANGAY +HIGLKGLMLLGRIKNNQEGKKLNLVIKNEEYFEFVQNRNN diff --git a/data/nontn7/lbcas12a.fasta b/data/nontn7/lbcas12a.fasta new file mode 100644 index 00000000..2fb9ea28 --- /dev/null +++ b/data/nontn7/lbcas12a.fasta @@ -0,0 +1,22 @@ +>tr|A0A5S8WF58|A0A5S8WF58_9FIRM LbCas12a OS=Lachnospiraceae bacterium OX=1898203 PE=1 SV=1 +MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAEDYKGVKKLLDRYYLS +FINDVLHSIKLKNLNNYISLFRKKTRTEKENKELENLEINLRKEIAKAFKGNEGYKSLFK +KDIIETILPEFLDDKDEIALVNSFNGFTTAFTGFFDNRENMFSEEAKSTSIAFRCINENL +TRYISNMDIFEKVDAIFDKHEVQEIKEKILNSDYDVEDFFEGEFFNFVLTQEGIDVYNAI +IGGFVTESGEKIKGLNEYINLYNQKTKQKLPKFKPLYKQVLSDRESLSFYGEGYTSDEEV +LEVFRNTLNKNSEIFSSIKKLEKLFKNFDEYSSAGIFVKNGPAISTISKDIFGEWNVIRD +KWNAEYDDIHLKKKAVVTEKYEDDRRKSFKKIGSFSLEQLQEYADADLSVVEKLKEIIIQ +KVDEIYKVYGSSEKLFDADFVLEKSLKKNDAVVAIMKDLLDSVKSFENYIKAFFGEGKET +NRDESFYGDFVLAYDILLKVDHIYDAIRNYVTQKPYSKDKFKLYFQNPQFMGGWDKDKET +DYRATILRYGSKYYLAIMDKKYAKCLQKIDKDDVNGNYEKINYKLLPGPNKMLPKVFFSK +KWMAYYNPSEDIQKIYKNGTFKKGDMFNLNDCHKLIDFFKDSISRYPKWSNAYDFNFSET +EKYKDIAGFYREVEEQGYKVSFESASKKEVDKLVEEGKLYMFQIYNKDFSDKSHGTPNLH +TMYFKLLFDENNHGQIRLSGGAELFMRRASLKKEELVVHPANSPIANKNPDNPKKTTTLS +YDVYKDKRFSEDQYELHIPIAINKCPKNIFKINTEVRVLLKHDDNPYVIGIDRGERNLLY +IVVVDGKGNIVEQYSLNEIINNFNGIRIKTDYHSLLDKKEKERFEARQNWTSIENIKELK +AGYISQVVHKICELVEKYDAVIALEDLNSGFKNSRVKVEKQVYQKFEKMLIDKLNYMVDK +KSNPCATGGALKGYQITNKFESFKSMSTQNGFIFYIPAWLTSKIDPSTGFVNLLKTKYTS +IADSKKFISSFDRIMYVPEEDLFEFALDYKNFSRTDADYIKKWKLYSYGNRIRIFRNPKK +NNVFDWEEVCLTSAYKELFNKYGINYQQGDIRALLCEQSDKAFYSSFMALMSLMLQMRNS +ITGRTDVDFLISPVKNSDGIFYDSRNYEAQENAILPKNADANGAYNIARKVLWAIGQFKK +AEDEKLDKVKIAISNKEWLEYAQTSVKH diff --git a/data/nontn7/more-cas12a.fasta b/data/nontn7/more-cas12a.fasta new file mode 100644 index 00000000..6e86638c --- /dev/null +++ b/data/nontn7/more-cas12a.fasta @@ -0,0 +1,116 @@ +>WP_068647445.1 type V CRISPR-associated protein Cas12a/Cpf1 [Thiomicrospira sp. XS5] +MTKTFDSEFFNLYSLQKTVRFELKPVGETASFVEDFKNEGLKRVVSEDERRAVDYQKVKEIIDDYHRDFI +EESLNYFPEQVSKDALEQAFHLYQKLKAAKVEEREKALKEWEALQKKLREKVVKCFSDSNKARFSRIDKK +ELIKEDLINWLVAQNREDDIPTVETFNNFTTYFTGFHENRKNIYSKDDHATAISFRLIHENLPKFFDNVI +SFNKLKEGFPELKFDKVKEDLEVDYDLKHAFEIEYFVNFVTQAGIDQYNYLLGGKTLEDGTKKQGMNEQI +NLFKQQQTRDKARQIPKLIPLFKQILSERTESQSFIPKQFESDQELFDSLQKLHNNCQDKFTVLQQAILG +LAEADLKKVFIKTSDLNALSNTIFGNYSVFSDALNLYKESLKTKKAQEAFEKLPAHSIHDLIQYLEQFNS +SLDAEKQQSTDTVLNYFIKTDELYSRFIKSTSEAFTQVQPLFELEALSSKRRPPESEDEGAKGQEGFEQI +KRIKAYLDTLMEAVHFAKPLYLVKGRKMIEGLDKDQSFYEAFEMAYQELESLIIPIYNKARSYLSRKPFK +ADKFKINFDNNTLLSGWDANKETANASILFKKDGLYYLGIMPKGKTFLFDYFVSSEDSEKLKQRRQKTAE +EALAQDGESYFEKIRYKLLPGASKMLPKVFFSNKNIGFYNPSDDILRIRNTASHTKNGTPQKGHSKVEFN +LNDCHKMIDFFKSSIQKHPEWGSFGFTFSDTSDFEDMSAFYREVENQGYVISFDKIKETYIQSQVEQGNL +YLFQIYNKDFSPYSKGKPNLHTLYWKALFEEANLNNVVAKLNGEAEIFFRRHSIKASDKVVHPANQAIDN +KNPHTEKTQSTFEYDLVKDKRYTQDKFFFHVPISLNFKAQGVSKFNDKVNGFLKGNPDVNIIGIDRGERH +LLYFTVVNQKGEILVQESLNTLMSDKGHVNDYQQKLDKKEQERDAARKSWTTVENIKELKEGYLSHVVHK +LAHLIIKYNAIVCLEDLNFGFKRGRFKVEKQVYQKFEKALIDKLNYLVFKEKELGEVGHYLTAYQLTAPF +ESFKKLGKQSGILFYVPADYTSKIDPTTGFVNFLDLRYQSVEKAKQLLSDFNAIRFNSVQNYFEFEIDYK +KLTPKRKVGTQSKWVICTYGDVRYQNRRNQKGHWETEEVNVTEKLKALFASDSKTTTVIDYANDDNLIDV +ILEQDKASFFKELLWLLKLTMTLRHSKIKSEDDFILSPVKNEQGEFYDSRKAGEVWPKDADANGAYHIAL +KGLWNLQQINQWEKGKTLNLAIKNQDWFSFIQEKPYQE +>WP_039871282.1 type V CRISPR-associated protein Cas12a/Cpf1 [Prevotella bryantii] +MKFTDFTGLYSLSKTLRFELKPIGKTLENIKKAGLLEQDQHRADSYKKVKKIIDEYHKAFIEKSLSNFEL +KYQSEDKLDSLEEYLMYYSMKRIEKTEKDKFAKIQDNLRKQIADHLKGDESYKTIFSKDLIRKNLPDFVK +SDEERTLIKEFKDFTTYFKGFYENRENMYSAEDKSTAISHRIIHENLPKFVDNINAFSKIILIPELREKL +NQIYQDFEEYLNVESIDEIFHLDYFSMVMTQKQIEVYNAIIGGKSTNDKKIQGLNEYINLYNQKHKDCKL +PKLKLLFKQILSDRIAISWLPDNFKDDQEALDSIDTCYKNLLNDGNVLGEGNLKLLLENIDTYNLKGIFI +RNDLQLTDISQKMYASWNVIQDAVILDLKKQVSRKKKESAEDYNDRLKKLYTSQESFSIQYLNDCLRAYG +KTENIQDYFAKLGAVNNEHEQTINLFAQVRNAYTSVQAILTTPYPENANLAQDKETVALIKNLLDSLKRL +QRFIKPLLGKGDESDKDERFYGDFTPLWETLNQITPLYNMVRNYMTRKPYSQEKIKLNFENSTLLGGWDL +NKEHDNTAIILRKNGLYYLAIMKKSANKIFDKDKLDNSGDCYEKMVYKLLPGANKMLPKVFFSKSRIDEF +KPSENIIENYKKGTHKKGANFNLADCHNLIDFFKSSISKHEDWSKFNFHFSDTSSYEDLSDFYREVEQQG +YSISFCDVSVEYINKMVEKGDLYLFQIYNKDFSEFSKGTPNMHTLYWNSLFSKENLNNIIYKLNGQAEIF +FRKKSLNYKRPTHPAHQAIKNKNKCNEKKESIFDYDLVKDKRYTVDKFQFHVPITMNFKSTGNTNINQQV +IDYLRTEDDTHIIGIDRGERHLLYLVVIDSHGKIVEQFTLNEIVNEYGGNIYRTNYHDLLDTREQNREKA +RESWQTIENIKELKEGYISQVIHKITDLMQKYHAVVVLEDLNMGFMRGRQKVEKQVYQKFEEMLINKLNY +LVNKKADQNSAGGLLHAYQLTSKFESFQKLGKQSGFLFYIPAWNTSKIDPVTGFVNLFDTRYESIDKAKA +FFGKFDSIRYNADKDWFEFAFDYNNFTTKAEGTRTNWTICTYGSRIRTFRNQAKNSQWDNEEIDLTKAYK +AFFAKHGINIYDNIKEAIAMETEKSFFEDLLHLLKLTLQMRNSITGTTTDYLISPVHDSKGNFYDSRICD +NSLPANADANGAYNIARKGLMLIQQIKDSTSSNRFKFSPITNKDWLIFAQEKPYLND +>WP_115247861.1 type V CRISPR-associated protein Cas12a/Cpf1 [Moraxella lacunata] +MLFQDFTHLYPLSKTVRFELKPIGKTLEHIHAKNFLSQDETMADMYQKVKAILDDYHRDFITKMMSEVTL +TKLPEFYEVYLALRKNPKDDTLQKQLTEIQTALREEVVKPIDSGGKYKAGYERLFGAKLFKDGKELGDLA +KFVIAQEGESSPKLPQIAHFEKFSTYFTGFHDNRKNMYSSDDKHTAIAYRLIHENLPRFIDNLQILVTIK +QKHSVLYDQIVNELNANGLDVSLASHLDGYHKLLTQEGITAYNRIIGEVNSYTNKHNQICHKSERIAKLR +PLHKQILSDGMGVSFLPSKFADDSEMCQAVNEFYRHYAHVFAKVQSLFDRFDDYQKDGIYVEHKNLNELS +KQAFGDFALLGRVLDGYYVDVVNPEFNDKFAKAKTDNAKEKLTKEKDKFIKGVHSLASLEQAIEHYIAGH +DDESVQAGKLGQYFKHGLAGVDNPIQKIHNSHSTIKGFLERERPAGERTLPKIKSDKSLEMTQLRQLKEL +LDNALNVVHFAKLLTTKTTLDNQDGNFYGEFGALYDELAKIATLYNKVRDYLSQKPFSTEKYKLNFGNPT +LLNGWDLNKEKDNFGVILQKDGCYYLALLDKAHKKVFDNAPNTGKSVYQKMVYKLLPGPNKMLPKVFFAK +SNLDYYNPSAELLDKYAQGTHKKGDNFNLKDCHALIDFFKASINKHPEWQHFGFEFSLTSSYQDLSDFYR +EVEPQGYQVKFVDIDADYIDELVEQGQLYLFQIYNKDFSPKAHGKPNLHTLYFKALFSEDNLANPIYKLN +GEAEIFYRKASLDMNETTIHRAGEVLENKNPDNPKERQFVYDIIKDKRYTQDKFMLHVPITMNFGVQGMT +IKEFNKKVNQSIQQYDEVNVIGIDRGERHLLYLTVINSKGEILEQRSLNDIITTSANGTQMTTPYHKILD +KREIERLNARVGWGEIETIKELKSGYLSHVVHQISQLMLKYNAIVVLEDLNFGFKRGRFKVEKQIYQNFE +NALIKKLNHLVLKDKADNEIGSYKNALQLTNNFTDLKSIGKQTGFLFYVPAWNTSKIDPVTGFVDLLKPR +YENIAQSQAFFDKFDKICYNADKGYFEFHIDYAKFTDKAKNSRQIWTICSHGDKRYVYDKTANQNKGATI +GINVNDELKSLFARYRINDKQPNLVMDICQNNDKEFHKSLTYLLKALLALRYSNASSDEDFILSPVANDK +GVFFNSALADDTQPQNADANGAYHIALKGLWLLNELKNSDDLDKVKLAIDNQTWLNFAQNR +>WP_035798880.1 type V CRISPR-associated protein Cas12a/Cpf1 [Butyrivibrio sp. NC3005] +MYYQNLTKKYPVSKTIRNELIPIGKTLENIRKNNILESDVKRKQDYEHVKGIMDEYHKQLINEALDNYML +PSLNQAAEIYLKKHVDVEDREEFKKTQDLLRREVTGRLKEHENYTKIGKKDILDLLEKLPSISEEDYNAL +ESFRNFYTYFTSYNKVRENLYSDEEKSSTVAYRLINENLPKFLDNIKSYAFVKAAGVLADCIEEEEQDAL +FMVETFNMTLTQEGIDMYNYQIGKVNSAINLYNQKNHKVEEFKKIPKMKVLYKQILSDREEVFIGEFKDD +ETLLSSIGAYGNVLMTYLKSEKINIFFDALRESEGKNVYVKNDLSKTTMSNIVFGSWSAFDELLNQEYDL +ANENKKKDDKYFEKRQKELKKNKSYTLEQMSNLSKEDISPIENYIERISEDIEKICIYNGEFEKIVVNEH +DSSRKLSKNIKAVKVIKDYLDSIKELEHDIKLINGSGQELEKNLVVYVGQEEALEQLRPVDSLYNLTRNY +LTKKPFSTEKVKLNFNKSTLLNGWDKNKETDNLGILFFKDGKYYLGIMNTTANKAFVNPPAAKTENVFKK +VDYKLLPGSNKMLPKVFFAKSNIGYYNPSTELYSNYKKGTHKKGPSFSIDDCHNLIDFFKESIKKHEDWS +KFGFEFSDTADYRDISEFYREVEKQGYKLTFTDIDESYINDLIEKNELYLFQIYNKDFSEYSKGKLNLHT +LYFMMLFDQRNLDNVVYKLNGEAEVFYRPASIAENELVIHKAGEGIKNKNPNRAKVKETSTFSYDIVKDK +RYSKYKFTLHIPITMNFGVDEVRRFNDVINNALRTDDNVNVIGIDRGERNLLYVVVINSEGKILEQISLN +SIINKEYDIETNYHALLDEREDDRNKARKDWNTIENIKELKTGYLSQVVNVVAKLVLKYNAIICLEDLNF +GFKRGRQKVEKQVYQKFEKMLIEKLNYLVIDKSREQVSPEKMGGALNALQLTSKFKSFAELGKQSGIIYY +VPAYLTSKIDPTTGFVNLFYIKYENIEKAKQFFDGFDFIRFNKKDDMFEFSFDYKSFTQKACGIRSKWIV +YTNGERIIKYPNPEKNNLFDEKVINVTDEIKGLFKQYRIPYENGEDIKEIIISKAEADFYKRLFRLLHQT +LQMRNSTSDGTRDYIISPVKNDRGEFFCSEFSEGTMPKDADANGAYNIARKGLWVLEQIRQKDEGEKVNL +SMTNAEWLKYAQLHLL +>WP_027216152.1 type V CRISPR-associated protein Cas12a/Cpf1 [Butyrivibrio fibrisolvens] +MYYESLTKLYPIKKTIRNELVPIGKTLENIKKNNILEADEDRKIAYIRVKAIMDDYHKRLINEALSGFAL +IDLDKAANLYLSRSKSADDIESFSRFQDKLRKAIAKRLREHENFGKIGNKDIIPLLQKLSENEDDYNALE +SFKNFYTYFESYNDVRLNLYSDKEKSSTVAYRLINENLPRFLDNIRAYDAVQKAGITSEELSSEAQDGLF +LVNTFNNVLIQDGINTYNEDIGKLNVAINLYNQKNASVQGFRKVPKMKVLYKQILSDREESFIDEFESDT +ELLDSLESHYANLAKYFGSNKVQLLFTALRESKGVNVYVKNDIAKTSFSNVVFGSWSRIDELINGEYDDN +NNRKKDEKYYDKRQKELKKNKSYTIEKIITLSTEDVDVIGKYIEKLESDIDDIRFKGKNFYEAVLCGHDR +SKKLSKNKGAVEAIKGYLDSVKDFERDLKLINGSGQELEKNLVVYGEQEAVLSELSGIDSLYNMTRNYLT +KKPFSTEKIKLNFNKPTFLDGWDYGNEEAYLGFFMIKEGNYFLAVMDANWNKEFRNIPSVDKSDCYKKVI +YKQISSPEKSIQNLMVIDGKTVKKNGRKEKEGIHSGENLILEELKNTYLPKKINDIRKRRSYLNGDTFSK +KDLTEFIGYYKQRVIEYYNGYSFYFKSDDDYASFKEFQEDVGRQAYQISYVDVPVSFVDDLINSGKLYLF +RVYNKDFSEYSKGRLNLHTLYFKMLFDERNLKNVVYKLNGQAEVFYRPSSIKKEELIVHRAGEEIKNKNP +KRAAQKPTRRLDYDIVKDRRYSQDKFMLHTSIIMNFGAEENVSFNDIVNGVLRNEDKVNVIGIDRGERNL +LYVVVIDPEGKILEQRSLNCITDSNLDIETDYHRLLDEKESDRKIARRDWTTIENIKELKAGYLSQVVHI +VAELVLKYNAIICLEDLNFGFKRGRQKVEKQVYQKFEKMLIDKLNYLVMDKSREQLSPEKISGALNALQL +TPDFKSFKVLGKQTGIIYYVPAYLTSKIDPMTGFANLFYVKYENVDKAKEFFSKFDSIKYNKDGKNWNTK +GYFEFAFDYKKFTDRAYGRVSEWTVCTVGERIIKFKNKEKNNSYDDKVIDLTNSLKELFDSYKVTYESEV +DLKDAILAIDDPAFYRDLTRRLQQTLQMRNSSCDGSRDYIISPVKNSKGEFFCSDNNDDTTPNDADANGA +FNIARKGLWVLNEIRNSEEGSKINLAMSNAQWLEYAQDNTI +>EFI15981.1 conserved hypothetical protein [Bacteroidetes oral taxon 274 str. F0058] +MRKFNEFVGLYPISKTLRFELKPIGKTLEHIQRNKLLEHDAVRADDYVKVKKIIDKYHKCLIDEALSGFT +FDTEADGRSNNSLSEYYLYYNLKKRNEQEQKTFKTIQNNLRKQIVNKLTQSEKYKRIDKKELITTDLPDF +LTNESEKELVEKFKNFTTYFTEFHKNRKNMYSKEEKSTAIAFRLINENLPKFVDNIAAFEKVVSSPLAEK +INALYEDFKEYLNVEEISRVFRLDYYDELLTQKQIDLYNAIVGGRTEEDNKIQIKGLNQYINEYNQQQTD +RSNRLPKLKPLYKQILSDRESVSWLPPKFDSDKNLLIKIKECYDALSEKEKVFDKLESILKSLSTYDLSK +IYISNDSQLSYISQKMFGRWDIISKAIREDCAKRNPQKSRESLEKFAERIDKKLKTIDSISIGDVDECLA +QLGETYVKRVEDYFVAMGESEIDDEQTDTTSFKKNIEGAYESVKELLNNADNITDNNLMQDKGNVEKIKT +LLDAIKDLQRFIKPLLGKGDEADKDGVFYGEFTSLWTKLDQVTPLYNMVRNYLTSKPYSTKKIKLNFENS +TLMDGWDLNKEPDNTTVIFCKDGLYYLGIMGKKYNRVFVDREDLPHDGECYDKMEYKLLPGANKMLPKVF +FSETGIQRFLPSEELLGKYERGTHKKGAGFDLGDCRALIDFFKKSIERHDDWKKFDFKFSDTSTYQDISE +FYREVEQQGYKMSFRKVSVDYIKSLVEEGKLYLFQIYNKDFSAHSKGTPNMHTLYWKMLFDEENLKDVVY +KLNGEAEVFFRKSSITVQSPTHPANSPIKNKNKDNQKKESKFEYDLIKDRRYTVDKFLFHVPITMNFKSV +GGSNINQLVKRHIRSATDLHIIGIDRGERHLLYLTVIDSRGNIKEQFSLNEIVNEYNGNTYRTDYHELLD +TREGERTEARRNWQTIQNIRELKEGYLSQVIHKISELAIKYNAVIVLEDLNFGFMRSRQKVEKQVYQKFE +KMLIDKLNYLVDKKKPVAETGGLLRAYQLTGEFESFKTLGKQSGILFYVPAWNTSKIDPVTGFVNLFDTH +YENIEKAKVFFDKFKSIRYNSDKDWFEFVVDDYTRFSPKAEGTRRDWTICTQGKRIQICRNHQRNNEWEG +QEIDLTKAFKEHFEAYGVDISKDLREQINTQNKKEFFEELLRLLRLTLQMRNSMPSSDIDYLISPVANDT +GCFFDSRKQAELKENAVLPMNADANGAYNIARKGLLAIRKMKQEENDSAKISLAISNKEWLKFAQTKPYL +ED diff --git a/src/nontn7/README.md b/src/nontn7/README.md index d10dd701..ef2ecffc 100644 --- a/src/nontn7/README.md +++ b/src/nontn7/README.md @@ -12,6 +12,7 @@ A pipeline for discovering non-Tn7 CASTs. - `NCBI Blast 2.10+` [https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) - `ripgrep` [https://github.com/BurntSushi/ripgrep](https://github.com/BurntSushi/ripgrep) - `fd` [https://github.com/sharkdp/fd](https://github.com/sharkdp/fd) + - `seqkit 0.14.0` [https://bioinf.shenwei.me/seqkit/](https://bioinf.shenwei.me/seqkit/) ### Data Dependencies diff --git a/src/nontn7/main.sh b/src/nontn7/main.sh index b218d884..b13da7d1 100755 --- a/src/nontn7/main.sh +++ b/src/nontn7/main.sh @@ -411,3 +411,13 @@ for cluster in 24 50 122 154 do; done | python simple-reblast.py $BLASTN_DB $BLASTP_DB $ARRAYS_OPTIONAL_DIRECTORY/all/cas12.fully-analyzed/reblast cat $ARRAYS_OPTIONAL_DIRECTORY/all/cas12.fully-analyzed/reblast/*csv | python minced.py | python find-cas12-sts.py > $OUTPUT/cas12-rpn-candidates.csv + +# Assemble canonical nuclease-active Cas12a proteins to align them with the Rpn-associated ones +if [[ ! -e $OUTPUT/cas12.afa ]]; then + cat $DATA/cas12.fasta $DATA/lbcas12a.fasta $DATA/fncas12a.fasta $DATA/more-cas12a.fasta > $OUTPUT/cas12.fasta + + # Add the Rpn-associated Cas12 proteins + # We eliminate the "ORPV" operon since it's a really tiny contig and not an Rpn-associated system + python make_effector_fasta.py Cpf1 < $OUTPUT/cas12-rpn-candidates.csv | seqkit rmdup -s | rg -v ORPV >> $OUTPUT/cas12.fasta + mafft --localpair --maxiterate 10000 $OUTPUT/cas12.fasta > $OUTPUT/cas12.afa +fi diff --git a/src/nontn7/make_effector_fasta.py b/src/nontn7/make_effector_fasta.py index cfa42d6e..c879586c 100644 --- a/src/nontn7/make_effector_fasta.py +++ b/src/nontn7/make_effector_fasta.py @@ -10,7 +10,7 @@ def get_lowest_evalue_protein(operon: genes.Operon, protein_name: str) -> Optional[genes.Feature]: - proteins = sorted([feature for feature in operon.get(protein_name)], key=lambda x: x.e_val) + proteins = sorted([feature for feature in operon.get(protein_name, regex=True)], key=lambda x: x.e_val) if proteins: return proteins[0] return None