Building a dataset for machine learning
Use localpdb to create a dataset for machine learning purpose¶
Within this notebook we demonstrate the procedures used to derive the dataset used to train a coiled-coil domain prediction method - DeepCoil (Repository, Paper). The steps described here can be easily adapted to any other task of interest, e.g. secondary structure prediction, other motif detection etc.
Imports and definitions¶
In [1]:
Copied!
import pandas as pd
import os
from localpdb import PDB
NCBI_PATH='/opt/apps/ncbi-blast+/bin/'
NP = 16
import pandas as pd
import os
from localpdb import PDB
NCBI_PATH='/opt/apps/ncbi-blast+/bin/'
NP = 16
Helper functions¶
In [2]:
Copied!
def map_socket_to_seqres(pdb_chain, socket_data):
"""
Calculate mapping from the SOCKET (structural data) to SEQRES (sequence data)
"""
pdb_id, chain_id = pdb_chain.split('_')
try:
cc_residues = {str(res): 1 for cc in socket_data.values()
for indice in cc['indices'].values()
for res in range(int(indice['start']), int(indice['end'])+1)
if indice['chain'] == chain_id}
cc_heptads = {str(res): hept for cc in socket_data.values()
for indice, heptads in zip(cc['indices'].values(), cc['heptads'].values())
for res, hept in zip(range(int(indice['start']), int(indice['end'])+1), heptads)
if indice['chain'] == chain_id}
except KeyError:
cc_residues = {}
cc_heptads = {}
try:
presence_mapping = lpdb.map_pdb_feat_to_seqres(cc_residues, pdb_chain)
heptad_mapping = lpdb.map_pdb_feat_to_seqres(cc_heptads, pdb_chain)
except ValueError:
return ['0', '0']
return (''.join([str(feat) for feat in presence_mapping]),
''.join([str(feat) for feat in heptad_mapping]))
def map_socket_to_seqres(pdb_chain, socket_data):
"""
Calculate mapping from the SOCKET (structural data) to SEQRES (sequence data)
"""
pdb_id, chain_id = pdb_chain.split('_')
try:
cc_residues = {str(res): 1 for cc in socket_data.values()
for indice in cc['indices'].values()
for res in range(int(indice['start']), int(indice['end'])+1)
if indice['chain'] == chain_id}
cc_heptads = {str(res): hept for cc in socket_data.values()
for indice, heptads in zip(cc['indices'].values(), cc['heptads'].values())
for res, hept in zip(range(int(indice['start']), int(indice['end'])+1), heptads)
if indice['chain'] == chain_id}
except KeyError:
cc_residues = {}
cc_heptads = {}
try:
presence_mapping = lpdb.map_pdb_feat_to_seqres(cc_residues, pdb_chain)
heptad_mapping = lpdb.map_pdb_feat_to_seqres(cc_heptads, pdb_chain)
except ValueError:
return ['0', '0']
return (''.join([str(feat) for feat in presence_mapping]),
''.join([str(feat) for feat in heptad_mapping]))
In [3]:
Copied!
def corr_seq(seq):
"""
Corrects seqres sequence to map non-std residues to 'X' - this fixes biopython aligner weird behaviour
:param seq: input sequence
:return: corrected sequence with non-std residues changed to 'X'
"""
letters = set(list('ACDEFGHIKLMNPQRSTVWYX'))
corr_seq = ''.join([aa if aa in letters else 'X' for aa in seq])
return corr_seq
def corr_seq(seq):
"""
Corrects seqres sequence to map non-std residues to 'X' - this fixes biopython aligner weird behaviour
:param seq: input sequence
:return: corrected sequence with non-std residues changed to 'X'
"""
letters = set(list('ACDEFGHIKLMNPQRSTVWYX'))
corr_seq = ''.join([aa if aa in letters else 'X' for aa in seq])
return corr_seq
Load localpdb and plugins¶
In [4]:
Copied!
lpdb = PDB(db_path='/home/db/localpdb', plugins=['Socket', 'PDBSeqresMapper', 'PDBClustering'], version=20210716)
lpdb = PDB(db_path='/home/db/localpdb', plugins=['Socket', 'PDBSeqresMapper', 'PDBClustering'], version=20210716)
In [5]:
Copied!
lpdb.load_clustering_data(redundancy='100')
lpdb.load_clustering_data(redundancy='50')
print(f'Loaded localpdb with {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
lpdb.load_clustering_data(redundancy='100')
lpdb.load_clustering_data(redundancy='50')
print(f'Loaded localpdb with {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Loaded localpdb with 176175 structures and 603869 chains
Create datasets¶
Load test set data used in the original DeepCoil paper¶
In [6]:
Copied!
df_test1 = pd.read_csv('data/raw/deepcoil_test1.csv', index_col=0)
df_test2 = pd.read_csv('data/raw/deepcoil_test2.csv', index_col=0)
test1_idx = set(df_test1.index)
test2_idx = set(df_test2.index)
df_test1 = pd.read_csv('data/raw/deepcoil_test1.csv', index_col=0)
df_test2 = pd.read_csv('data/raw/deepcoil_test2.csv', index_col=0)
test1_idx = set(df_test1.index)
test2_idx = set(df_test2.index)
Check if all entries from both DeepCoil test sets are still in PDB¶
In [7]:
Copied!
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)
if len(diff_idx1) > 0:
print(f'Removing {len(diff_idx1)} entries from test_set1 - no longer available in PDB\n')
test1_idx -= diff_idx1
if len(diff_idx2) > 0:
print(f'Removing {len(diff_idx2)} entries from test_set2 - no longer available in PDB\n')
test2_idx -= diff_idx2
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)
if len(diff_idx1) > 0:
print(f'Removing {len(diff_idx1)} entries from test_set1 - no longer available in PDB\n')
test1_idx -= diff_idx1
if len(diff_idx2) > 0:
print(f'Removing {len(diff_idx2)} entries from test_set2 - no longer available in PDB\n')
test2_idx -= diff_idx2
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
Removing 1 entries from test_set1 - no longer available in PDB
Filter out NMR structures, structures with resolution below 4 angstroms, and with sequence length below 20¶
In [8]:
Copied!
lpdb.entries = lpdb.entries[lpdb.entries['method'] != 'NMR']
lpdb.entries = lpdb.entries[lpdb.entries['resolution'] <= 4.0]
lpdb.chains = lpdb.chains[lpdb.chains['sequence'].str.len() >= 20]
lpdb.chains = lpdb.chains[lpdb.chains.index.isin(list(lpdb._mapping_dict.keys()))]
print('Filtering by method, resolution and sequence length...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
lpdb.entries = lpdb.entries[lpdb.entries['method'] != 'NMR']
lpdb.entries = lpdb.entries[lpdb.entries['resolution'] <= 4.0]
lpdb.chains = lpdb.chains[lpdb.chains['sequence'].str.len() >= 20]
lpdb.chains = lpdb.chains[lpdb.chains.index.isin(list(lpdb._mapping_dict.keys()))]
print('Filtering by method, resolution and sequence length...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Filtering by method, resolution and sequence length... Remaining: 150623 structures and 386570 chains
Check if all entries from both DeepCoil test sets have PDB<->SEQRES mapping¶
In [9]:
Copied!
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)
if len(diff_idx1) > 0:
print(f'Removing {len(diff_idx1)} entries from test_set1 - no valid mapping\n')
test1_idx -= diff_idx1
if len(diff_idx2) > 0:
print(f'Removing {len(diff_idx2)} entries from test_set2 - no valid mapping\n')
test2_idx -= diff_idx2
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)
if len(diff_idx1) > 0:
print(f'Removing {len(diff_idx1)} entries from test_set1 - no valid mapping\n')
test1_idx -= diff_idx1
if len(diff_idx2) > 0:
print(f'Removing {len(diff_idx2)} entries from test_set2 - no valid mapping\n')
test2_idx -= diff_idx2
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
Removing 54 entries from test_set1 - no valid mapping Removing 9 entries from test_set2 - no valid mapping
Filter out identical sequences (select representative with best resolution) - leave entries from the test sets¶
In [10]:
Copied!
nr_idx = set(lpdb.chains.groupby(by='clust-100')['resolution'].idxmin())
lpdb.chains = lpdb.chains.loc[(nr_idx | test_idx)]
lpdb.chains = lpdb.chains.sort_index()
print('Filtering identical entries...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
nr_idx = set(lpdb.chains.groupby(by='clust-100')['resolution'].idxmin())
lpdb.chains = lpdb.chains.loc[(nr_idx | test_idx)]
lpdb.chains = lpdb.chains.sort_index()
print('Filtering identical entries...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Filtering identical entries... Remaining: 70447 structures and 78537 chains
Label residues based on the SOCKET data¶
In [11]:
Copied!
socket_data_74_overlap = lpdb.get_socket_dict(cutoff='7.4', method='overlap')
socket_data_74_overlap = lpdb.get_socket_dict(cutoff='7.4', method='overlap')
In [12]:
Copied!
socket_74_labels = {pdb_chain: map_socket_to_seqres(pdb_chain, socket_data_74_overlap.get(pdb_chain.split('_')[0], {}))
for pdb_chain in lpdb.chains.index}
socket_74_labels = {pdb_chain: map_socket_to_seqres(pdb_chain, socket_data_74_overlap.get(pdb_chain.split('_')[0], {}))
for pdb_chain in lpdb.chains.index}
In [13]:
Copied!
lpdb.chains['socket_74_label'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][0])
lpdb.chains['socket_74_heptads'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][1])
lpdb.chains['socket_74_label'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][0])
lpdb.chains['socket_74_heptads'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][1])
Dump updated test sets to CSV¶
In [14]:
Copied!
df_test1 = pd.merge(df_test1, lpdb.chains.loc[test1_idx, ['socket_74_label', 'socket_74_heptads']],
left_index=True, right_index=True)
df_test2 = pd.merge(df_test2, lpdb.chains.loc[test2_idx, ['socket_74_label', 'socket_74_heptads']],
left_index=True, right_index=True)
df_test1 = pd.merge(df_test1, lpdb.chains.loc[test1_idx, ['socket_74_label', 'socket_74_heptads']],
left_index=True, right_index=True)
df_test2 = pd.merge(df_test2, lpdb.chains.loc[test2_idx, ['socket_74_label', 'socket_74_heptads']],
left_index=True, right_index=True)
In [15]:
Copied!
print('Saving updated test set CSVs')
df_test1.to_csv('data/dc-test1.csv')
df_test2.to_csv('data/dc-test2.csv')
print('Saving updated test set CSVs')
df_test1.to_csv('data/dc-test1.csv')
df_test2.to_csv('data/dc-test2.csv')
Saving updated test set CSVs
Generate blast db from test set entries to query with the remaining chains¶
In [16]:
Copied!
f = open('calc/psiblast/dc-db.fas', 'w')
for id_, seq in lpdb.chains.loc[test_idx].sequence.iteritems():
f.write(f'>{id_}\n{seq}\n')
f.close()
f = open('calc/psiblast/dc-db.fas', 'w')
for id_, seq in lpdb.chains.loc[test_idx].sequence.iteritems():
f.write(f'>{id_}\n{seq}\n')
f.close()
In [17]:
Copied!
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/makeblastdb -in dc-db.fas -dbtype prot')
os.chdir('./../../')
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/makeblastdb -in dc-db.fas -dbtype prot')
os.chdir('./../../')
Remove test set entries from lpdb¶
In [18]:
Copied!
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(test_idx)]
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(test_idx)]
Query the rest of the sequences against the test set and select only those which are below 30% similarity¶
In [19]:
Copied!
f = open('calc/psiblast/dc-query.fas', 'w')
for id_, seq in lpdb.chains.sequence.iteritems():
f.write(f'>{id_}\n{seq}\n')
f.close()
f = open('calc/psiblast/dc-query.fas', 'w')
for id_, seq in lpdb.chains.sequence.iteritems():
f.write(f'>{id_}\n{seq}\n')
f.close()
In [20]:
Copied!
print("Calculating similarity of the PDB sequences against the test sequences...\n")
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/psiblast -query dc-query.fas -db dc-db.fas -outfmt "6 qseqid sseqid pident qcovs evalue" ' \
f'-evalue 1e-2 -num_threads {NP} -max_target_seqs 2 > dc-query.csv')
os.chdir('./../../')
print("Calculating similarity of the PDB sequences against the test sequences...\n")
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/psiblast -query dc-query.fas -db dc-db.fas -outfmt "6 qseqid sseqid pident qcovs evalue" ' \
f'-evalue 1e-2 -num_threads {NP} -max_target_seqs 2 > dc-query.csv')
os.chdir('./../../')
Calculating similarity of the PDB sequences against the test sequences...
In [21]:
Copied!
redundant_df = pd.read_csv('calc/psiblast/dc-query.csv', sep='\s+', header=None,
names=['qid','sid','ident', 'cov','evalue'])
redundant_df['w_ident'] = redundant_df['ident'] * redundant_df['cov'] / 100
redundant_idx = set(redundant_df[(redundant_df['ident'] > 30) & (redundant_df['cov'] > 50)].qid.values)
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(redundant_idx)]
redundant_df = pd.read_csv('calc/psiblast/dc-query.csv', sep='\s+', header=None,
names=['qid','sid','ident', 'cov','evalue'])
redundant_df['w_ident'] = redundant_df['ident'] * redundant_df['cov'] / 100
redundant_idx = set(redundant_df[(redundant_df['ident'] > 30) & (redundant_df['cov'] > 50)].qid.values)
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(redundant_idx)]
In [22]:
Copied!
print('Removing entries similar to entries in DeepCoil test set...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
print('Removing entries similar to entries in DeepCoil test set...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Removing entries similar to entries in DeepCoil test set... Remaining: 66535 structures and 74076 chains
Limit sequence redundancy to 50% in train set, select cluster representatives with highest CC residues fraction¶
In [23]:
Copied!
lpdb.chains['cc_frac'] = lpdb.chains['socket_74_label'].apply(lambda x: 1 - x.count('0')/len(x))
lpdb.chains['cc_frac'] = lpdb.chains['socket_74_label'].apply(lambda x: 1 - x.count('0')/len(x))
In [24]:
Copied!
idx = lpdb.chains.groupby(by='clust-50')['cc_frac'].idxmax()
lpdb.chains = lpdb.chains.loc[idx]
print('Limiting redundancy to 50%...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
idx = lpdb.chains.groupby(by='clust-50')['cc_frac'].idxmax()
lpdb.chains = lpdb.chains.loc[idx]
print('Limiting redundancy to 50%...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Limiting redundancy to 50%... Remaining: 32390 structures and 35189 chains
In [25]:
Copied!
pos_entries = len(lpdb.chains[lpdb.chains['cc_frac'] > 0])
pos_entries_frac = len(lpdb.chains[lpdb.chains['cc_frac'] > 0]) / len(lpdb.chains)
print(f'Positive entries: {pos_entries}, frac={pos_entries_frac}')
pos_entries = len(lpdb.chains[lpdb.chains['cc_frac'] > 0])
pos_entries_frac = len(lpdb.chains[lpdb.chains['cc_frac'] > 0]) / len(lpdb.chains)
print(f'Positive entries: {pos_entries}, frac={pos_entries_frac}')
Positive entries: 4019, frac=0.11421182755974879
In [26]:
Copied!
tokens = [int(label) for entry in lpdb.chains.socket_74_label.str.cat() for label in entry]
print(f'Positive tokens: {tokens.count(1)}, frac={tokens.count(1)/len(tokens)}\n')
tokens = [int(label) for entry in lpdb.chains.socket_74_label.str.cat() for label in entry]
print(f'Positive tokens: {tokens.count(1)}, frac={tokens.count(1)/len(tokens)}\n')
Positive tokens: 181723, frac=0.018116988833092568
Correct noncanonical aa's in sequences¶
In [27]:
Copied!
lpdb.chains['sequence'] = lpdb.chains['sequence'].apply(corr_seq)
lpdb.chains['sequence'] = lpdb.chains['sequence'].apply(corr_seq)
Save to CSV¶
In [28]:
Copied!
lpdb.chains.to_csv('data/dc-train.csv')
print('DONE!')
lpdb.chains.to_csv('data/dc-train.csv')
print('DONE!')
DONE!