Use localpdb to create a dataset for machine learning purposes
In this notebook we demonstrate the procedure used to derive the dataset for training the coiled-coil domain prediction method DeepCoil (Repository, Paper). The steps described here can be easily adapted to other tasks of interest, e.g. secondary structure prediction or detection of other sequence motifs.
Imports and definitions
In [1]:
import pandas as pd
import os
from localpdb import PDB

NCBI_PATH = '/opt/apps/ncbi-blast+/bin/'  # location of the BLAST+ binaries
NP = 16  # number of threads for psiblast
Helper functions
In [2]:
def map_socket_to_seqres(pdb_chain, socket_data):
    """
    Calculate the mapping from the SOCKET (structural) data to the SEQRES (sequence) data.
    Uses the globally loaded lpdb instance for the PDB<->SEQRES mapping.
    """
    pdb_id, chain_id = pdb_chain.split('_')
    try:
        # Per-residue coiled-coil presence labels for this chain
        cc_residues = {str(res): 1 for cc in socket_data.values() 
                       for indice in cc['indices'].values() 
                       for res in range(int(indice['start']), int(indice['end'])+1) 
                       if indice['chain'] == chain_id}
        # Per-residue heptad register assignments for this chain
        cc_heptads = {str(res): hept for cc in socket_data.values() 
                      for indice, heptads in zip(cc['indices'].values(), cc['heptads'].values()) 
                      for res, hept in zip(range(int(indice['start']), int(indice['end'])+1), heptads) 
                      if indice['chain'] == chain_id}
    except KeyError:
        cc_residues = {}
        cc_heptads = {}
    try:
        presence_mapping = lpdb.map_pdb_feat_to_seqres(cc_residues, pdb_chain)
        heptad_mapping = lpdb.map_pdb_feat_to_seqres(cc_heptads, pdb_chain)
    except ValueError:
        return ('0', '0')
    return (''.join([str(feat) for feat in presence_mapping]), 
            ''.join([str(feat) for feat in heptad_mapping]))
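To make the comprehension logic above concrete, here is a minimal, fabricated SOCKET-style dictionary (illustration only; the real data comes from lpdb.get_socket_dict below) together with the presence labels derived from it:

# Toy example (fabricated data): one coiled coil with a single helix on
# chain A spanning residues 5-8, with heptad register 'abcd'
socket_toy = {'cc_1': {'indices': {'h1': {'chain': 'A', 'start': '5', 'end': '8'}},
                       'heptads': {'h1': 'abcd'}}}
cc_residues = {str(res): 1 for cc in socket_toy.values()
               for indice in cc['indices'].values()
               for res in range(int(indice['start']), int(indice['end']) + 1)
               if indice['chain'] == 'A'}
print(cc_residues)  # {'5': 1, '6': 1, '7': 1, '8': 1}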
    
In [3]:
def corr_seq(seq):
    """
    Correct a SEQRES sequence by mapping non-standard residues to 'X' - this
    avoids odd behaviour of the biopython aligner.
    :param seq: input sequence
    :return: corrected sequence with non-standard residues changed to 'X'
    """
    letters = set('ACDEFGHIKLMNPQRSTVWYX')
    return ''.join([aa if aa in letters else 'X' for aa in seq])
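A quick illustration on a toy sequence (not from the dataset): any character outside the 21-letter alphabet above is replaced with 'X':

print(corr_seq('MKLBVAA'))  # -> 'MKLXVAA' ('B'/Asx is not a standard residue code)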
    
Load localpdb and plugins
In [4]:
lpdb = PDB(db_path='/home/db/localpdb', plugins=['Socket', 'PDBSeqresMapper', 'PDBClustering'], version=20210716)
In [5]:
lpdb.load_clustering_data(redundancy='100')
lpdb.load_clustering_data(redundancy='50')
print(f'Loaded localpdb with {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Loaded localpdb with 176175 structures and 603869 chains
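Loading the clustering data adds 'clust-100' and 'clust-50' columns to lpdb.chains; both are used for redundancy filtering below. As an optional sanity check (not part of the original pipeline, assuming the standard localpdb column layout), the annotations can be inspected directly:

lpdb.chains[['sequence', 'clust-100', 'clust-50']].head()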
Create datasets
Load test set data used in the original DeepCoil paper
In [6]:
df_test1 = pd.read_csv('data/raw/deepcoil_test1.csv', index_col=0)
df_test2 = pd.read_csv('data/raw/deepcoil_test2.csv', index_col=0)
test1_idx = set(df_test1.index)
test2_idx = set(df_test2.index)
Check if all entries from both DeepCoil test sets are still in the PDB
In [7]:
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)
if len(diff_idx1) > 0:
    print(f'Removing {len(diff_idx1)} entries from test_set1 - no longer available in PDB\n')
    test1_idx -= diff_idx1
if len(diff_idx2) > 0:
    print(f'Removing {len(diff_idx2)} entries from test_set2 - no longer available in PDB\n')
    test2_idx -= diff_idx2

test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
Removing 1 entries from test_set1 - no longer available in PDB
Filter out NMR structures, structures with resolution worse than 4 Å, and chains with sequence length below 20 residues
In [8]:
lpdb.entries = lpdb.entries[lpdb.entries['method'] != 'NMR']
lpdb.entries = lpdb.entries[lpdb.entries['resolution'] <= 4.0]
lpdb.chains = lpdb.chains[lpdb.chains['sequence'].str.len() >= 20]
# Keep only chains with an available PDB<->SEQRES mapping
lpdb.chains = lpdb.chains[lpdb.chains.index.isin(list(lpdb._mapping_dict.keys()))]
print('Filtering by method, resolution and sequence length...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Filtering by method, resolution and sequence length...
Remaining: 150623 structures and 386570 chains
Check if all entries from both DeepCoil test sets have a PDB<->SEQRES mapping
In [9]:
diff_idx1 = test1_idx - set(lpdb.chains.index)
diff_idx2 = test2_idx - set(lpdb.chains.index)
if len(diff_idx1) > 0:
    print(f'Removing {len(diff_idx1)} entries from test_set1 - no valid mapping\n')
    test1_idx -= diff_idx1
if len(diff_idx2) > 0:
    print(f'Removing {len(diff_idx2)} entries from test_set2 - no valid mapping\n')
    test2_idx -= diff_idx2

test_idx = test1_idx | test2_idx
test_pdb = {pdb_chain.split('_')[0] for pdb_chain in test_idx}
Removing 54 entries from test_set1 - no valid mapping
Removing 9 entries from test_set2 - no valid mapping
Filter out identical sequences (selecting the representative with the best resolution), keeping entries from the test sets
In [10]:
# 'clust-100' groups chains with identical sequences; keep the best-resolution
# representative of each cluster, plus all remaining test set entries
nr_idx = set(lpdb.chains.groupby(by='clust-100')['resolution'].idxmin())
lpdb.chains = lpdb.chains.loc[(nr_idx | test_idx)]
lpdb.chains = lpdb.chains.sort_index()
print('Filtering identical entries...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Filtering identical entries...
Remaining: 70447 structures and 78537 chains
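The groupby/idxmin idiom above picks, within each 100% identity cluster, the index of the chain with the numerically lowest (i.e. best) resolution. A self-contained toy example with fabricated identifiers:

toy = pd.DataFrame({'clust-100': ['c1', 'c1', 'c2'],
                    'resolution': [2.5, 1.8, 3.0]},
                   index=['1abc_A', '2def_A', '3ghi_B'])
print(toy.groupby(by='clust-100')['resolution'].idxmin())
# c1 -> '2def_A' (1.8 A beats 2.5 A), c2 -> '3ghi_B'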
Label residues based on the SOCKET data
In [11]:
socket_data_74_overlap = lpdb.get_socket_dict(cutoff='7.4', method='overlap')
In [12]:
socket_74_labels = {pdb_chain: map_socket_to_seqres(pdb_chain, socket_data_74_overlap.get(pdb_chain.split('_')[0], {}))
                    for pdb_chain in lpdb.chains.index}
In [13]:
lpdb.chains['socket_74_label'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][0])
lpdb.chains['socket_74_heptads'] = lpdb.chains.index.map(lambda x: socket_74_labels[x][1])
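Each chain is now annotated with two strings aligned to its SEQRES sequence: a per-residue coiled-coil presence label and the corresponding heptad assignments. An optional quick check (not part of the original notebook):

lpdb.chains[['socket_74_label', 'socket_74_heptads']].head()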
    
Dump updated test sets to CSV
In [14]:
df_test1 = pd.merge(df_test1, lpdb.chains.loc[test1_idx, ['socket_74_label', 'socket_74_heptads']],
                    left_index=True, right_index=True)
df_test2 = pd.merge(df_test2, lpdb.chains.loc[test2_idx, ['socket_74_label', 'socket_74_heptads']],
                    left_index=True, right_index=True)
In [15]:
print('Saving updated test set CSVs')
df_test1.to_csv('data/dc-test1.csv')
df_test2.to_csv('data/dc-test2.csv')
Saving updated test set CSVs
Generate a BLAST database from the test set entries to query with the remaining chains
In [16]:
with open('calc/psiblast/dc-db.fas', 'w') as f:
    for id_, seq in lpdb.chains.loc[test_idx].sequence.items():
        f.write(f'>{id_}\n{seq}\n')
In [17]:
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/makeblastdb -in dc-db.fas -dbtype prot')
os.chdir('./../../')
Remove test set entries from lpdb
In [18]:
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(test_idx)]
Query the remaining sequences against the test set and keep only those that do not share more than 30% identity (at more than 50% coverage) with any test sequence
In [19]:
with open('calc/psiblast/dc-query.fas', 'w') as f:
    for id_, seq in lpdb.chains.sequence.items():
        f.write(f'>{id_}\n{seq}\n')
In [20]:
print("Calculating similarity of the PDB sequences against the test sequences...\n")
os.chdir('calc/psiblast/')
os.system(f'{NCBI_PATH}/psiblast -query dc-query.fas -db dc-db.fas -outfmt "6 qseqid sseqid pident qcovs evalue" '
          f'-evalue 1e-2 -num_threads {NP} -max_target_seqs 2 > dc-query.csv')
os.chdir('./../../')
Calculating similarity of the PDB sequences against the test sequences...
In [21]:
redundant_df = pd.read_csv('calc/psiblast/dc-query.csv', sep=r'\s+', header=None,
                           names=['qid', 'sid', 'ident', 'cov', 'evalue'])
# Identity weighted by query coverage (kept for reference; the filter below
# uses the raw identity and coverage thresholds)
redundant_df['w_ident'] = redundant_df['ident'] * redundant_df['cov'] / 100
redundant_idx = set(redundant_df[(redundant_df['ident'] > 30) & (redundant_df['cov'] > 50)].qid.values)
lpdb.chains = lpdb.chains[~lpdb.chains.index.isin(redundant_idx)]
In [22]:
print('Removing entries similar to entries in DeepCoil test set...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Removing entries similar to entries in DeepCoil test set...
Remaining: 66535 structures and 74076 chains
Limit sequence redundancy to 50% in the train set, selecting cluster representatives with the highest fraction of coiled-coil residues
In [23]:
# Fraction of residues in each chain labelled as coiled coil
lpdb.chains['cc_frac'] = lpdb.chains['socket_74_label'].apply(lambda x: 1 - x.count('0')/len(x))
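For intuition, cc_frac is simply the fraction of positions not labelled '0'. A toy label string (fabricated) makes this concrete:

label = '0011111000'  # toy label: 5 coiled-coil residues out of 10
print(1 - label.count('0') / len(label))  # 0.5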
    
In [24]:
idx = lpdb.chains.groupby(by='clust-50')['cc_frac'].idxmax()
lpdb.chains = lpdb.chains.loc[idx]
print('Limiting redundancy to 50%...')
print(f'Remaining: {len(lpdb.entries)} structures and {len(lpdb.chains)} chains\n')
Limiting redundancy to 50%...
Remaining: 32390 structures and 35189 chains
In [25]:
pos_entries = len(lpdb.chains[lpdb.chains['cc_frac'] > 0])
pos_entries_frac = pos_entries / len(lpdb.chains)
print(f'Positive entries: {pos_entries}, frac={pos_entries_frac}')
Positive entries: 4019, frac=0.11421182755974879
In [26]:
# Concatenate all label strings and count positive per-residue tokens
tokens = [int(label) for label in lpdb.chains.socket_74_label.str.cat()]
print(f'Positive tokens: {tokens.count(1)}, frac={tokens.count(1)/len(tokens)}\n')
Positive tokens: 181723, frac=0.018116988833092568
Correct noncanonical amino acids in sequences
In [27]:
lpdb.chains['sequence'] = lpdb.chains['sequence'].apply(corr_seq)
Save to CSV
In [28]:
lpdb.chains.to_csv('data/dc-train.csv')
print('DONE!')
DONE!