Basic pLAST usage

Imports and configuration

[1]:
import logging
from pathlib import Path
import pandas as pd
from plast.data import PLASTData
from plast.plast import PLAST
from examples.configuration import config

Load the configuration into a PLASTData object

[2]:
# Build the PLASTData object from the imported example `config`.
# The same `pl_data` instance is handed to both PLAST objects created further down.
pl_data = PLASTData(config)
Loading pLAST data...

Set the logger level to “warning” to avoid printing too much information

[3]:
# Grab the "plast" logger and raise its threshold so that only warnings
# and more severe records are emitted by the cells that follow.
logger = logging.getLogger("plast")
logger.setLevel(level=logging.WARNING)

Analyze sequence using MMseqs2 model

Initialize PLAST object - load config, choose model and add logger

[4]:
# Create the PLAST object for the MMseqs2 workflow: it receives the shared
# `pl_data`, the model name "mmseqs2_model" and the logger configured above.
plast = PLAST(data=pl_data, model="mmseqs2_model", logger=logger)

Load the example plasmid sequence from a .fasta file and annotate it with Prodigal

[5]:
# Read the example FASTA as text. Path.read_text() opens and closes the file
# itself, so no file handle is left dangling (a bare open(...).read() never
# closes it explicitly).
fasta_text = Path("examples/NZ_AP018444.1.fasta").read_text()

# Hand the raw FASTA text to pLAST, then show the sequence length and the
# parsed annotation table.
plast = plast.load_nt_fasta(fasta_text)
print("Plasmid of length", plast.length, "has the following annotation:")
display(plast.parsed)
Plasmid of langth 181607 has the following annotation:
locus_tag start end strand type gene product translation
0 NZ_AP018444.1_1 3 410 1 CDS FLIGTLLLFGLAACDSIKSVASDVTVGKVIEEFKAAGLEAEQPSDL...
1 NZ_AP018444.1_2 561 776 -1 CDS MKNKRDRLAILPAIDSFINFISITLTINARNNGINSTMASLIAMSF...
2 NZ_AP018444.1_3 848 937 -1 CDS MLQKEEKVMGKTLFIAEKLKVANEIIKSL*
3 NZ_AP018444.1_4 1127 1765 -1 CDS MEVLIFELILIAVLIPLNSVVKKHVPKWKGKAGEKLVKRMLSKLDP...
4 NZ_AP018444.1_5 1782 2015 -1 CDS MGTVKNVEKRLPTWANFGVIEKPVNRRVTRPFQFRGGLLVSLALGF...
... ... ... ... ... ... ... ... ...
187 NZ_AP018444.1_188 175616 176212 -1 CDS MSACSSSAQTKNDSRPAQAVQNGIQQHVEGKDIVDIPEAYKRKLKG...
188 NZ_AP018444.1_189 176689 179352 -1 CDS MGKTLFIAEKPKVANEIMKSPRFRHSQKYIGSKPYYGYYENDHYIV...
189 NZ_AP018444.1_190 179919 180323 -1 CDS MQQCLKYNCGTIQLEDLQGISKEQAFLKNWTYFDLQEKIKNQANQY...
190 NZ_AP018444.1_191 180464 181204 -1 CDS MKYIYVHFHYHFRTIGNRTIQKLWEYDNQSLKHFKDTGQYPSAQQL...
191 NZ_AP018444.1_192 181546 181605 1 CDS MISKDLSTVGGTHKYVKKII

192 rows × 8 columns

Assign cluster identifiers to the plasmid proteins

[6]:
# Assign MMseqs2 cluster identifiers to the plasmid proteins (GPU disabled, 12 threads).
plast = plast.assign_mmseqs_clusters(threads=12, use_gpu=False)

# `plast.vector` holds one entry per annotated feature; entries without an
# assigned cluster show up as None in the printed list.
cluster_ids = plast.vector
print(f"Number of assigned clusters: {len(cluster_ids)}")
print("Assigned clusters:", cluster_ids, sep="\n")
Number of assigned clusters: 192
Assigned clusters:
['158139', None, None, '111227', None, '363037', '180971', '367331', '332882', '22817', '170621', '57630', '199461', '59114', '9057', '150228', '27549', '177933', '256276', '171502', '222120', '131075', '279450', None, '292502', '372101', None, '349723', None, None, '219413', '268273', '348017', '239222', '195393', None, '29918', None, '29918', '357007', '252631', '379137', '63194', '281371', '271498', '14424', None, '62528', '159682', None, '95717', '14424', '14424', '372574', '77937', '248555', '370932', '116939', '348722', '116631', '288608', '62545', '181715', None, '260525', '127064', '65352', '354243', '13', '307598', '293539', '330390', '79368', '331546', '357602', '128987', '261376', None, '366603', '366603', '189155', '297412', '283545', '52421', None, '160853', '373949', '280027', '95716', '113193', '129570', '368936', '38910', '366768', '294849', '254013', None, '6994', '366724', '136722', '8135', None, '244957', '210862', '81929', None, None, '8645', '245020', None, '68864', '386183', None, None, '330287', '60086', None, '335942', '227142', '123577', '122666', '106646', '60005', '73409', '263758', '308699', '180655', '71936', None, '275472', None, None, None, None, None, '53569', '305343', '37746', '146411', None, '204716', '34535', '271497', '90426', '58700', '227877', '273403', '320114', None, '156029', None, None, None, '246224', None, None, '364912', '344084', '240006', None, '271473', '91528', '91993', '176191', None, '116489', '234981', '374753', '56767', '106646', '122666', '19209', '19209', '122666', '330484', '14956', '146511', '220332', '274341', '303134', '280768', '3116', '126570', '71819', '156029', '76830', '333634', '8703', '116829', None, None, None]

Pass data through pLAST model and receive embeddings

[7]:
# Run the cluster vector through the pLAST model, then print the resulting
# embedding under its heading.
plast = plast.encode()
print("Plasmid embedding:", plast.embedding, sep="\n")
Plasmid embedding:
[ 0.16074932  0.07090153 -0.05637924  0.01270099 -0.13037615  0.23649618
 -0.09985632  0.18405771 -0.12629367  0.08232442 -0.03229282 -0.30928066
  0.03496516 -0.15260804 -0.01811741 -0.00062386 -0.05955299  0.04845381
 -0.15461048  0.09944806 -0.07227306  0.07942476 -0.04140659 -0.01181537
  0.09717287  0.08005133  0.08573128 -0.12106762  0.00138966  0.22787371
 -0.24396281  0.13145718 -0.06370046 -0.06037332  0.26901937 -0.19918399
 -0.06123837  0.02366692  0.09150934 -0.13937949 -0.10196497  0.01230744
  0.05456771  0.08498187 -0.01927721 -0.13616739 -0.02521401  0.06099558
  0.0216114   0.02579266  0.17739704  0.06502925  0.03666644 -0.07709105
 -0.16628736 -0.07768066 -0.01253821 -0.31088704  0.03946643  0.05838454
  0.01965593  0.20820118  0.1418348   0.21397436]

Find similar plasmids in the database

[8]:
# Look up the database plasmids most similar to the encoded query.
# NOTE(review): maxret=15 presumably caps the number of returned hits (the
# table below has 15 rows) — confirm against the PLAST API.
results = plast.get_most_similar(maxret=15)
[9]:
# Turn the hit dictionary into a table: each key of `results` becomes a row label.
df = pd.DataFrame.from_dict(results, orient="index")

# Coerce the floating-point columns to numbers (unparseable values become NaN)
# and round them to six decimals for display.
numeric_cols = [name for name in ("pLAST_distance", "gc") if name in df.columns]
for name in numeric_cols:
    df[name] = pd.to_numeric(df[name], errors="coerce").round(6)

# Columns to show first, in this order, whenever they are present.
preferred = [
    "pLAST_distance",
    "length",
    "gc",
    "taxid",
    "organism",
    "definition",
    "taxonomy",
    "rep_type(s)",
    "AMR",
    "relaxase_type(s)",
    "mpf_type",
    "orit_type(s)",
    "predicted_mobility",
]

# Preferred columns lead; everything else follows in its original order.
leading = [name for name in preferred if name in df.columns]
trailing = [name for name in df.columns if name not in preferred]
cols = leading + trailing
display(df[cols])
pLAST_distance length gc taxid organism definition taxonomy rep_type(s) AMR relaxase_type(s) mpf_type orit_type(s) predicted_mobility primary_cluster_id
NZ_CP147792 0.994295 181634 0.325292 1392 Bacillus anthracis Bacillus anthracis strain Sterne-CLR1-12 plasm... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP076202 0.994227 184456 0.325503 1392 Bacillus anthracis Bacillus anthracis strain A3783 plasmid pXO1, ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP022045 0.994065 181634 0.325292 1392 Bacillus anthracis Bacillus anthracis strain FDAARGOS_341 plasmid... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP147790 0.994065 181634 0.325292 1392 Bacillus anthracis Bacillus anthracis strain Sterne-CLR1-2 plasmi... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP065741 0.993964 190859 0.326330 2026188 Bacillus tropicus Bacillus tropicus strain FDAARGOS_897 plasmid ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NC_010934 0.993878 190861 0.326339 1396 Bacillus cereus Bacillus cereus plasmid pBCXO1, complete sequence Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP167925 0.993756 181637 0.325292 1392 Bacillus anthracis Bacillus anthracis strain CWH_147 plasmid pXO1... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP140727 0.993629 181719 0.325194 1392 Bacillus anthracis Bacillus anthracis strain MH-PR plasmid pXO1, ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP047132 0.993583 181933 0.325389 1213182 Bacillus anthracis str. BF1 Bacillus anthracis str. BF1 plasmid pXO1, comp... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP060195 0.993508 181677 0.325303 1392 Bacillus anthracis Bacillus anthracis strain Kanchipuram plasmid ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP140725 0.993500 180367 0.325173 1392 Bacillus anthracis Bacillus anthracis strain MH-JJ plasmid pXO1, ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP147782 0.993409 181634 0.325292 1392 Bacillus anthracis Bacillus anthracis strain Sterne-CLR2-46 plasm... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP140730 0.993280 181707 0.325210 1392 Bacillus anthracis Bacillus anthracis strain MH-MFM plasmid pXO1,... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP140721 0.993245 181736 0.325197 1392 Bacillus anthracis Bacillus anthracis strain MH-VW plasmid pXO1, ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP009463 0.993232 180693 0.325475 1392 Bacillus anthracis Bacillus anthracis strain SK-102 plasmid pXO1,... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800

Analyze sequence using eggNOG model

Initialize PLAST object - load config, choose model and add logger

[10]:
# Create a fresh PLAST object for the eggNOG workflow, reusing the shared
# `pl_data` and logger. This rebinds `plast`, so the MMseqs2-based instance
# from the previous section is no longer reachable under that name.
plast = PLAST(data=pl_data, model="eggnog_model", logger=logger)

Load example plasmid sequence and GenBank annotation from .gb file

[11]:
# Load the sequence together with its GenBank annotation from the example .gb
# file, then show the sequence length and the parsed annotation table.
plast = plast.load_gbff(Path("examples/NZ_AP018444.1.gb"))
print("Plasmid of length", plast.length, "has the following annotation:")
display(plast.parsed)
Plasmid of langth 181607 has the following annotation:
start end strand partial type coordinates locus_tag old_locus_tag inference note ... product protein_id translation GO_function pseudo gene EC_number GO_process GO_component gene_synonym
1 598 807 1 0 CDS 598..807 BAZ_RS27670 NaN COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_003171438.1 MYIIINFEPLSPVMNDIAIKLAMVLFIPLFLALIVKVILMKFMKES... NaN NaN NaN NaN NaN NaN NaN
2 1127 1765 -1 0 CDS complement(1127..1765) BAZ_RS27675 BAZ_pXO1_00003 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... nuclease-related domain-containing protein WP_000453400.1 MEVLIFELILIAVLIPLNSVVKKHVPKWKGKAGEKLVKRMLSKLDP... GO:0003677 - DNA binding [Evidence IEA];GO:000... NaN NaN NaN NaN NaN NaN
3 1782 2165 -1 0 CDS complement(1782..2165) BAZ_RS27680 BAZ_pXO1_00004 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_003159741.1 MNRNREYIALRNSISIGWLLTCIVIGLSIYFSSVVYLLIGCFILSL... NaN NaN NaN NaN NaN NaN NaN
4 2291 2875 1 0 CDS 2291..2875 BAZ_RS27685 BAZ_pXO1_00005 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_000916417.1 MLKLVVNNTGTEPEGDSAFSNYFTCKDCVYYLSKSDSCSLQLAADS... NaN NaN NaN NaN NaN NaN NaN
5 2895 3473 1 0 CDS 2895..3473 BAZ_RS27690 BAZ_pXO1_00006 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_002194483.1 MLEHTILFSLVQLVPLIVSTGIIYILLELTDTEWKTWFSYEGIFAI... NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
161 175000 175287 -1 0 CDS complement(175000..175287) BAZ_RS28520 BAZ_pXO1_00191 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_000084556.1 MSQVPGFLKFVLAKERRYVYLAVAEKKNKRILTHIVYRFGPLEKAL... NaN NaN NaN NaN NaN NaN NaN
162 175364 175564 -1 0 CDS complement(175364..175564) BAZ_RS28525 BAZ_pXO1_00192 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_000344165.1 MDENKRNMLLSFIISILFIFTSLLPFSNNEYVYVISKIGAAAGVIN... NaN NaN NaN NaN NaN NaN NaN
163 175616 176260 -1 0 CDS complement(175616..176260) BAZ_RS28530 BAZ_pXO1_00193 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... thermonuclease family protein WP_000708136.1 MKIWIKAICITSFVIQMSACSSSAQTKNDSRPAQAVQNGIQQHVEG... NaN NaN NaN NaN NaN NaN NaN
164 176689 179352 -1 0 CDS complement(176689..179352) BAZ_RS28540 BAZ_pXO1_00195 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... type IA DNA topoisomerase WP_000520727.1 MGKTLFIAEKPKVANEIMKSPRFRHSQKYIGSKPYYGYYENDHYIV... GO:0003677 - DNA binding [Evidence IEA];GO:000... NaN NaN 5.6.2.1 GO:0006265 - DNA topological change [EvidenceIEA] NaN NaN
0 181589 181607 1 0 CDS join(181589..181607,1..410) BAZ_RS27665 BAZ_pXO1_00001 COORDINATES: similar to AAsequence:RefSeq:WP_0... Derived by automated computational analysis us... ... hypothetical protein WP_000914414.1 MLKKLSVFLIGTLLLFGLAACDSIKSVASDVTVGKVIEEFKAAGLE... NaN NaN NaN NaN NaN NaN NaN

136 rows × 22 columns

Assign cluster identifiers to the plasmid proteins

[12]:
# Annotate the plasmid proteins with eggNOG identifiers using 12 processes.
plast = plast.assign_eggnog_annot(processes=12)

# `plast.vector` holds one entry per annotated feature; features without an
# assignment appear as None in the printed list.
annotations = plast.vector
print(f"Number of assigned clusters: {len(annotations)}")
print("Assigned clusters:", annotations, sep="\n")
Number of assigned clusters: 136
Assigned clusters:
[None, '31K0D', None, '30DG3', None, '32907', None, None, 'COG3344', 'COG0553', None, 'COG0827', None, '2ZFG1', None, '2Z9QM', '2Z9QM', '30AUC', '32EU5', '33QRF', 'COG4973', 'COG0675', 'COG2250', 'COG3344', 'COG1708', None, None, None, 'COG3950', 'COG2865', '2ZWMK', 'COG0539', '30VTJ', 'COG1309.used_alg.fa', None, 'COG4823', 'COG0454', 'COG3385', 'COG3385', None, 'COG1476', '33G6F', '308XQ', 'COG3505', None, '30DRN', '349HH', '2ZM2H', '30JG6', None, '33PUD', '32AWE', 'COG4963', None, None, 'COG2866', None, None, None, 'COG3745', 'COG4963', 'COG4962', 'COG4965', '3294H', '3294H', '30JM3', 'COG4072', 'COG4960', None, 'COG5464', '30EMT', None, '30D9P', '30EJP', '336S4', 'COG3451', None, None, '33KVR', '33F95', 'COG1266', 'COG2311', '307WB', 'COG5386', None, '345N2', 'COG2866', 'COG0671', None, 'COG1210', '2Z8KQ', '33354', None, None, None, '2ZJNR', 'COG3711', None, None, None, 'COG4644', 'COG1961', '2ZAZ1', None, '2ZA3G', None, None, '32ZBI', 'COG0640', '30AH4', None, None, 'COG4973', None, '33Y12', None, None, '33ZVC', '32U5N', '332Y6', 'COG3443', 'COG2357', 'COG4973', 'COG3410', None, '319NK', '33JGE', None, None, 'COG1923', 'COG0640', '30EJD', None, 'COG1525', 'COG0550', '331IF']

Pass data through pLAST model and receive embeddings

[13]:
# Encode the eggNOG-based vector with the pLAST model and print the embedding
# beneath its heading.
plast = plast.encode()
print("Plasmid embedding:", plast.embedding, sep="\n")
Plasmid embedding:
[ 0.0589677   0.05433606 -0.1876128   0.03643819  0.24761888  0.06717072
  0.03598635 -0.2240049   0.14677612  0.08280926 -0.08353736 -0.06939356
  0.07016669  0.08643539 -0.05044083 -0.09069608 -0.10310914  0.16306525
  0.23802327  0.20261437  0.08285122  0.04954447 -0.25138718 -0.05838021
  0.04297755 -0.03393299  0.02068523 -0.1865046  -0.17108573  0.07016291
  0.01318081 -0.03065709  0.00491979 -0.10776813 -0.09607697 -0.03169782
 -0.06032306  0.05970465 -0.22047132 -0.2629159   0.15037817  0.04737961
 -0.06956949  0.0237163   0.00524222  0.05974862  0.2203716   0.32091218
 -0.08670954 -0.12627028  0.09016269  0.06958341  0.19005509 -0.05200532
 -0.09510881  0.01440437  0.10063916 -0.13191333 -0.00934572 -0.00705839
 -0.0150359   0.06664705 -0.1576495   0.13453296]

Find similar plasmids in the database

[14]:
# Look up the database plasmids most similar to the eggNOG-based embedding.
# This overwrites the `results` of the MMseqs2 section.
# NOTE(review): maxret=15 presumably caps the number of returned hits (the
# table below has 15 rows) — confirm against the PLAST API.
results = plast.get_most_similar(maxret=15)
[15]:
# One row per key of `results`; the dictionary values supply the columns.
df = pd.DataFrame.from_dict(results, orient="index")

# Distance and GC content are converted to numeric values (NaN when a value
# cannot be parsed) and rounded to six decimals.
for c in ("pLAST_distance", "gc"):
    if c not in df.columns:
        continue
    df[c] = pd.to_numeric(df[c], errors="coerce").round(6)

# Display order for the well-known columns.
preferred = [
    "pLAST_distance",
    "length",
    "gc",
    "taxid",
    "organism",
    "definition",
    "taxonomy",
    "rep_type(s)",
    "AMR",
    "relaxase_type(s)",
    "mpf_type",
    "orit_type(s)",
    "predicted_mobility",
]

# Start with the preferred columns that exist, then append every other column
# in the order the frame already has them.
cols = []
for column in preferred:
    if column in df.columns:
        cols.append(column)
cols.extend(column for column in df.columns if column not in preferred)
display(df[cols])
pLAST_distance length gc taxid organism definition taxonomy rep_type(s) AMR relaxase_type(s) mpf_type orit_type(s) predicted_mobility primary_cluster_id
NZ_CP010321 0.972325 181741 0.325353 1392 Bacillus anthracis Bacillus anthracis strain Canadian Bison isola... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP091763 0.971691 181764 0.325312 1392 Bacillus anthracis Bacillus anthracis strain PNO2 plasmid pXO1, c... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP010853 0.970925 181663 0.325333 1392 Bacillus anthracis Bacillus anthracis strain A1144 plasmid pXO1, ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP115871 0.970012 181667 0.325243 1392 Bacillus anthracis Bacillus anthracis strain IAL 52 plasmid pXO1,... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP009597 0.969928 181710 0.325194 1449979 Bacillus anthracis str. V770-NP-1R Bacillus anthracis str. V770-NP-1R plasmid pXO... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP014177 0.969738 181677 0.325293 1392 Bacillus anthracis Bacillus anthracis strain Stendal plasmid pXO1... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP015777 0.969738 181677 0.325297 1392 Bacillus anthracis Bacillus anthracis strain Tangail-1 plasmid pX... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP076208 0.969727 181766 0.325292 1392 Bacillus anthracis Bacillus anthracis strain Pollino 3734 plasmid... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP076193 0.969714 181763 0.325303 1392 Bacillus anthracis Bacillus anthracis strain UR-1 plasmid pXO1, c... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP009980 0.969302 181674 0.325302 1392 Bacillus anthracis Bacillus anthracis strain Ames isolate BACI008... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP012520 0.969289 181658 0.325331 1392 Bacillus anthracis Bacillus anthracis strain Larissa plasmid pXO1... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_AP018444 0.968104 181607 0.325395 1392 Bacillus anthracis Bacillus anthracis strain CZC5 plasmid pXO1, c... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP009696 0.967697 181920 0.325390 1392 Bacillus anthracis Bacillus anthracis strain RA3 plasmid pXO1, co... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP147784 0.967578 181634 0.325292 1392 Bacillus anthracis Bacillus anthracis strain Sterne-CLR2-13 plasm... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800
NZ_CP008847 0.967434 181894 0.325354 1392 Bacillus anthracis Bacillus anthracis strain HYU01 plasmid pX01, ... Bacteria;Bacillota;Bacilli;Bacillales;Bacillac... rc125 - MOBP MPF_T - conjugative AA800