Building a dataset for machine learning

Use localpdb to create a dataset for machine learning purpose¶

Within this notebook we demonstrate the procedures used to derive the dataset used to train a coiled-coil domain prediction method - DeepCoil (Repository, Paper). The steps described here can be easily adapted to any other task of interest, e.g. secondary structure prediction, other motif detection etc.

Imports and definitions¶

In [1]:

import pandas as pd
import os
from localpdb import PDB


NCBI_PATH='/opt/apps/ncbi-blast+/bin/'
NP = 16
import pandas as pd
import os
from localpdb import PDB


NCBI_PATH='/opt/apps/ncbi-blast+/bin/'
NP = 16

Helper functions¶

In [2]:

def map_socket_to_seqres(pdb_chain, socket_data):
    """
    Calculate mapping from the SOCKET (structural data) to SEQRES (sequence data)
    """
    pdb_id, chain_id = pdb_chain.split('_')
    try:
        cc_residues = {str(res): 1 for cc in socket_data.values() 
                       for indice in cc['indices'].values() 
                       for res in range(int(indice['start']), int(indice['end'])+1) 
                       if indice['chain'] == chain_id}
        
        cc_heptads = {str(res): hept for cc in socket_data.values() 
                      for indice, heptads in zip(cc['indices'].values(), cc['heptads'].values()) 
                      for res, hept in zip(range(int(indice['start']), int(indice['end'])+1), heptads) 
                      if indice['chain'] == chain_id}
    except KeyError:
        cc_residues = {}
        cc_heptads = {}
    try:
        presence_mapping = lpdb.map_pdb_feat_to_seqres(cc_residues, pdb_chain)
        heptad_mapping = lpdb.map_pdb_feat_to_seqres(cc_heptads, pdb_chain)
    except ValueError:
        return ['0', '0']
    return (''.join([str(feat) for feat in presence_mapping]), 
            ''.join([str(feat) for feat in heptad_mapping]))
def map_socket_to_seqres(pdb_chain, socket_data):
    """
    Calculate mapping from the SOCKET (structural data) to SEQRES (sequence data)
    """
    pdb_id, chain_id = pdb_chain.split('_')
    try:
        cc_residues = {str(res): 1 for cc in socket_data.values() 
                       for indice in cc['indices'].values() 
                       for res in range(int(indice['start']), int(indice['end'])+1) 
                       if indice['chain'] == chain_id}
        
        cc_heptads = {str(res): hept for cc in socket_data.values() 
                      for indice, heptads in zip(cc['indices'].values(), cc['heptads'].values()) 
                      for res, hept in zip(range(int(indice['start']), int(indice['end'])+1), heptads) 
                      if indice['chain'] == chain_id}
    except KeyError:
        cc_residues = {}
        cc_heptads = {}
    try:
        presence_mapping = lpdb.map_pdb_feat_to_seqres(cc_residues, pdb_chain)
        heptad_mapping = lpdb.map_pdb_feat_to_seqres(cc_heptads, pdb_chain)
    except ValueError:
        return ['0', '0']
    return (''.join([str(feat) for feat in presence_mapping]), 
            ''.join([str(feat) for feat in heptad_mapping]))

In [3]:

def corr_seq(seq):
    """
    Corrects seqres sequence to map non-std residues to 'X' - this fixes biopython aligner weird behaviour
    :param seq: input sequence
    :return: corrected sequence with non-std residues changed to 'X'
    """
    letters = set(list('ACDEFGHIKLMNPQRSTVWYX'))
    corr_seq = ''.join([aa if aa in letters else 'X' for aa in seq])
    return corr_seq
def corr_seq(seq):
    """
    Corrects seqres sequence to map non-std residues to 'X' - this fixes biopython aligner weird behaviour
    :param seq: input sequence
    :return: corrected sequence with non-std residues changed to 'X'
    """
    letters = set(list('ACDEFGHIKLMNPQRSTVWYX'))
    corr_seq = ''.join([aa if aa in letters else 'X' for aa in seq])
    return corr_seq

Load localpdb and plugins¶

In [4]:

lpdb = PDB(db_path='/home/db/localpdb', plugins=['Socket', 'PDBSeqresMapper', 'PDBClustering'], version=20210716)
lpdb = PDB(db_path='/home/db/localpdb', plugins=['Socket', 'PDBSeqresMapper', 'PDBClustering'], version=20210716)

In [5]:

lpdb.load_clustering_data(redundancy='100')
lpdb.load_clustering_data(redundancy='50')
print(f'Loaded localpdb with {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
lpdb.load_clustering_data(redundancy='100')
lpdb.load_clustering_data(redundancy='50')
print(f'Loaded localpdb with {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')

Loaded localpdb with 176175 structures and 603869 chains

Create datasets¶

Load test set data used in the original DeepCoil paper¶

In [6]:

df_test1 = pd.read_csv('data/raw/deepcoil_test1.csv', index_col=0)
df_test2 = pd.read_csv('data/raw/deepcoil_test2.csv', index_col=0)
test1_idx = set(df_test1.index)
test2_idx = set(df_test2.index)
df_test1 = pd.read_csv('data/raw/deepcoil_test1.csv', index_col=0)
df_test2 = pd.read_csv('data/raw/deepcoil_test2.csv', index_col=0)
test1_idx = set(df_test1.index)
test2_idx = set(df_test2.index)

Check if all entries from both DeepCoil test sets are still in PDB¶

In [7]:

diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)

if len(diff_idx1) > 0:
    print(f'Removing {len(diff_idx1)} entries from test_set1 - no longer available in PDB\n')
    test1_idx -= diff_idx1

if len(diff_idx2) > 0:
    print(f'Removing {len(diff_idx2)} entries from test_set2 - no longer available in PDB\n')
    test2_idx -= diff_idx2
    
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)

if len(diff_idx1) > 0:
    print(f'Removing {len(diff_idx1)} entries from test_set1 - no longer available in PDB\n')
    test1_idx -= diff_idx1

if len(diff_idx2) > 0:
    print(f'Removing {len(diff_idx2)} entries from test_set2 - no longer available in PDB\n')
    test2_idx -= diff_idx2
    
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}

Removing 1 entries from test_set1 - no longer available in PDB

Filter out NMR structures, structures with resolution below 4 angstroms, and with sequence length below 20¶

In [8]:

lpdb.entries = lpdb.entries[lpdb.entries['method'] != 'NMR']
lpdb.entries = lpdb.entries[lpdb.entries['resolution'] <= 4.0]
lpdb.chains = lpdb.chains[lpdb.chains['sequence'].str.len() >= 20]
lpdb.chains = lpdb.chains[lpdb.chains.index.isin(list(lpdb._mapping_dict.keys()))]

print('Filtering by method, resolution and sequence length...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
lpdb.entries = lpdb.entries[lpdb.entries['method'] != 'NMR']
lpdb.entries = lpdb.entries[lpdb.entries['resolution'] <= 4.0]
lpdb.chains = lpdb.chains[lpdb.chains['sequence'].str.len() >= 20]
lpdb.chains = lpdb.chains[lpdb.chains.index.isin(list(lpdb._mapping_dict.keys()))]

print('Filtering by method, resolution and sequence length...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')

Filtering by method, resolution and sequence length...
Remaining: 150623 structures and 386570 chains

Check if all entries from both DeepCoil test sets have PDB<->SEQRES mapping¶

In [9]:

diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)

if len(diff_idx1) > 0:
    print(f'Removing {len(diff_idx1)} entries from test_set1 - no valid mapping\n')
    test1_idx -= diff_idx1

if len(diff_idx2) > 0:
    print(f'Removing {len(diff_idx2)} entries from test_set2 - no valid mapping\n')
    test2_idx -= diff_idx2
    
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)

if len(diff_idx1) > 0:
    print(f'Removing {len(diff_idx1)} entries from test_set1 - no valid mapping\n')
    test1_idx -= diff_idx1

if len(diff_idx2) > 0:
    print(f'Removing {len(diff_idx2)} entries from test_set2 - no valid mapping\n')
    test2_idx -= diff_idx2
    
test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}

Removing 54 entries from test_set1 - no valid mapping

Removing 9 entries from test_set2 - no valid mapping

Filter out identical sequences (select representative with best resolution) - leave entries from the test sets¶

In [10]:

nr_idx = set(lpdb.chains.groupby(by='clust-100')['resolution'].idxmin())
lpdb.chains = lpdb.chains.loc[(nr_idx | test_idx)]
lpdb.chains = lpdb.chains.sort_index()

print('Filtering identical entries...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
nr_idx = set(lpdb.chains.groupby(by='clust-100')['resolution'].idxmin())
lpdb.chains = lpdb.chains.loc[(nr_idx | test_idx)]
lpdb.chains = lpdb.chains.sort_index()

print('Filtering identical entries...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')

Filtering identical entries...
Remaining: 70447 structures and 78537 chains

Label residues based on the SOCKET data¶

In [11]:

socket_data_74_overlap = lpdb.get_socket_dict(cutoff='7.4', method='overlap')
socket_data_74_overlap = lpdb.get_socket_dict(cutoff='7.4', method='overlap')

In [12]:

socket_74_labels = {pdb_chain: map_socket_to_seqres(pdb_chain, socket_data_74_overlap.get(pdb_chain.split('_')[0], {})) 
             for pdb_chain in lpdb.chains.index}
socket_74_labels = {pdb_chain: map_socket_to_seqres(pdb_chain, socket_data_74_overlap.get(pdb_chain.split('_')[0], {})) 
             for pdb_chain in lpdb.chains.index}

In [13]:

lpdb.chains['socket_74_label'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][0])
lpdb.chains['socket_74_heptads'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][1])
lpdb.chains['socket_74_label'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][0])
lpdb.chains['socket_74_heptads'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][1])

Dump updated test sets to CSV¶

In [14]:

df_test1 = pd.merge(df_test1, lpdb.chains.loc[test1_idx, ['socket_74_label', 'socket_74_heptads']],
                    left_index=True, right_index=True)

df_test2 = pd.merge(df_test2, lpdb.chains.loc[test2_idx, ['socket_74_label', 'socket_74_heptads']],
                    left_index=True, right_index=True)
df_test1 = pd.merge(df_test1, lpdb.chains.loc[test1_idx, ['socket_74_label', 'socket_74_heptads']],
                    left_index=True, right_index=True)

df_test2 = pd.merge(df_test2, lpdb.chains.loc[test2_idx, ['socket_74_label', 'socket_74_heptads']],
                    left_index=True, right_index=True)

In [15]:

print('Saving updated test set CSVs')
df_test1.to_csv('data/dc-test1.csv')
df_test2.to_csv('data/dc-test2.csv')
print('Saving updated test set CSVs')
df_test1.to_csv('data/dc-test1.csv')
df_test2.to_csv('data/dc-test2.csv')

Saving updated test set CSVs

Generate blast db from test set entries to query with the remaining chains¶

In [16]:

f = open('calc/psiblast/dc-db.fas', 'w')
for id_, seq in lpdb.chains.loc[test_idx].sequence.iteritems():
    f.write(f'>{id_}\n{seq}\n')
f.close()
f = open('calc/psiblast/dc-db.fas', 'w')
for id_, seq in lpdb.chains.loc[test_idx].sequence.iteritems():
    f.write(f'>{id_}\n{seq}\n')
f.close()

In [17]:

os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/makeblastdb -in dc-db.fas -dbtype prot')
os.chdir('./../../')
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/makeblastdb -in dc-db.fas -dbtype prot')
os.chdir('./../../')

Remove test set entries from lpdb¶

In [18]:

lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(test_idx)]
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(test_idx)]

Query the rest of the sequences against the test set and select only those which are below 30% similarity¶

In [19]:

f = open('calc/psiblast/dc-query.fas', 'w')
for id_, seq in lpdb.chains.sequence.iteritems():
    f.write(f'>{id_}\n{seq}\n')
f.close()
f = open('calc/psiblast/dc-query.fas', 'w')
for id_, seq in lpdb.chains.sequence.iteritems():
    f.write(f'>{id_}\n{seq}\n')
f.close()

In [20]:

print("Calculating similarity of the PDB sequences against the test sequences...\n")
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/psiblast -query dc-query.fas -db dc-db.fas -outfmt "6 qseqid sseqid pident qcovs evalue" ' \
          f'-evalue 1e-2 -num_threads {NP} -max_target_seqs 2 > dc-query.csv')
os.chdir('./../../')
print("Calculating similarity of the PDB sequences against the test sequences...\n")
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/psiblast -query dc-query.fas -db dc-db.fas -outfmt "6 qseqid sseqid pident qcovs evalue" ' \
          f'-evalue 1e-2 -num_threads {NP} -max_target_seqs 2 > dc-query.csv')
os.chdir('./../../')

Calculating similarity of the PDB sequences against the test sequences...

In [21]:

redundant_df = pd.read_csv('calc/psiblast/dc-query.csv', sep='\s+', header=None, 
                           names=['qid','sid','ident', 'cov','evalue'])
redundant_df['w_ident'] = redundant_df['ident'] * redundant_df['cov'] / 100
redundant_idx = set(redundant_df[(redundant_df['ident'] > 30) & (redundant_df['cov'] > 50)].qid.values)
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(redundant_idx)]
redundant_df = pd.read_csv('calc/psiblast/dc-query.csv', sep='\s+', header=None, 
                           names=['qid','sid','ident', 'cov','evalue'])
redundant_df['w_ident'] = redundant_df['ident'] * redundant_df['cov'] / 100
redundant_idx = set(redundant_df[(redundant_df['ident'] > 30) & (redundant_df['cov'] > 50)].qid.values)
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(redundant_idx)]

In [22]:

print('Removing entries similar to entries in DeepCoil test set...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
print('Removing entries similar to entries in DeepCoil test set...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')

Removing entries similar to entries in DeepCoil test set...
Remaining: 66535 structures and 74076 chains

Limit sequence redundancy to 50% in train set, select cluster representatives with highest CC residues fraction¶

In [23]:

lpdb.chains['cc_frac'] = lpdb.chains['socket_74_label'].apply(lambda x: 1 - x.count('0')/len(x))
lpdb.chains['cc_frac'] = lpdb.chains['socket_74_label'].apply(lambda x: 1 - x.count('0')/len(x))

In [24]:

idx = lpdb.chains.groupby(by='clust-50')['cc_frac'].idxmax()
lpdb.chains = lpdb.chains.loc[idx]
print('Limiting redundancy to 50%...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
idx = lpdb.chains.groupby(by='clust-50')['cc_frac'].idxmax()
lpdb.chains = lpdb.chains.loc[idx]
print('Limiting redundancy to 50%...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')

Limiting redundancy to 50%...
Remaining: 32390 structures and 35189 chains

In [25]:

pos_entries = len(lpdb.chains[lpdb.chains['cc_frac'] > 0])
pos_entries_frac = len(lpdb.chains[lpdb.chains['cc_frac'] > 0]) / len(lpdb.chains)

print(f'Positive entries: {pos_entries}, frac={pos_entries_frac}')
pos_entries = len(lpdb.chains[lpdb.chains['cc_frac'] > 0])
pos_entries_frac = len(lpdb.chains[lpdb.chains['cc_frac'] > 0]) / len(lpdb.chains)

print(f'Positive entries: {pos_entries}, frac={pos_entries_frac}')

Positive entries: 4019, frac=0.11421182755974879

In [26]:

tokens = [int(label) for entry in lpdb.chains.socket_74_label.str.cat() for label in entry]
print(f'Positive tokens: {tokens.count(1)}, frac={tokens.count(1)/len(tokens)}\n')
tokens = [int(label) for entry in lpdb.chains.socket_74_label.str.cat() for label in entry]
print(f'Positive tokens: {tokens.count(1)}, frac={tokens.count(1)/len(tokens)}\n')

Positive tokens: 181723, frac=0.018116988833092568

Correct noncanonical aa's in sequences¶

In [27]:

lpdb.chains['sequence'] = lpdb.chains['sequence'].apply(corr_seq)
lpdb.chains['sequence'] = lpdb.chains['sequence'].apply(corr_seq)

Save to CSV¶

In [28]:

lpdb.chains.to_csv('data/dc-train.csv')
print('DONE!')
lpdb.chains.to_csv('data/dc-train.csv')
print('DONE!')

DONE!