Source code for faltwerk.io

from collections import defaultdict
import gzip
from io import StringIO
import json
from pathlib import Path
import tempfile
try:
    from typing import Union
except ImportError:
    from typing_extensions import Union

from Bio import PDB, SeqUtils
from Bio.PDB import PDBIO, Structure
from Bio.PDB.PDBParser import PDBParser
from Bio import SeqIO
import requests
import screed

from faltwerk.external import reindex_pdb
from faltwerk.utils import mean_pairwise_similarity


[docs]def save_pdb(structure: Structure, out: Union[str, Path, StringIO]) -> None:
    '''
    Save structure as pdb file.
    '''
    file = PDBIO()
    file.set_structure(structure)
    
    # Save to stream
    if type(out) == StringIO:
        file.save(out)
        return out

    # Save to file
    else:
        p = Path(out)
        if not p.parent.exists():
            p.parent.mkdir(parents=True)
        file.save(p.resolve().__str__())
        return None


[docs]def load_conserved(fp, ref=None, metric=mean_pairwise_similarity):
    '''
    If no reference sequence name is provided, assume the first sequence is
    the reference. Why do we even need to specify the reference? Bc/ in the MSA
    it can contain gaps, which we'll omit bc/ we want to be able to map the
    conservation values to the protein structure, which does not contain gaps
    and we assume is identical to the reference sequence.

    Available functions:

    - mean_pairwise_similarity
    - entropy
    '''
    variance = defaultdict(list)
    cnt, ix = 0, -1

    with screed.open(fp) as file:
        # If no reference is provided, assume that the first sequence is ref.
        if (not ref) and (cnt == 0):
            ix = cnt

        for i in file:
            if (ref == i.name) and (ix != 0):
                ix = cnt
            
            for pos, aa in enumerate(i.sequence):
                variance[pos].append(aa)
            cnt += 1
        
    l = []
    for pos, residues in variance.items():
        ref_aa = residues[ix]
        if not ref_aa == '-':
            l.append(metric(residues))

    return l


[docs]def is_gz_file(filepath):
    '''
    https://stackoverflow.com/questions/3703276/how-to-tell-if-a-file-is-gzip-compressed
    '''
    with open(filepath, 'rb') as test_f:
        return test_f.read(2) == b'\x1f\x8b'


def read_sequence(fp: Union[str, Path]):
    # https://www.biostars.org/p/435629/
    with open(fp, 'r') as file:
        l = []
        for record in SeqIO.parse(file, 'pdb-atom'):
            l.append(record.seq)
        if len(l) > 1:
            print('Warning: More than one sequence found, returning first')
        return l[0].__str__()


[docs]def read_pdb(fp: Union[str, Path], name: str='x', strict: bool=True, reindex: bool=False, reindex_start_position: int=1) -> Structure:
    '''
    Return structure AND sequence

    https://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ

    The hierarchy of the PDB parser is this:

    >>> p = PDBParser()
    >>> structure = p.get_structure("X", "pdb1fat.ent")
    >>> for model in structure:
    >>>     for chain in model:
    >>>         for residue in chain:
    >>>             for atom in residue:
    >>>                 print(atom)
    '''
    fp = Path(fp)
    assert fp.exists()

    # print(is_gz_file(fp))
    if is_gz_file(fp):
        tmp = tempfile.NamedTemporaryFile()

        with gzip.open(fp) as file:
            file_content = file.read()
            tmp.write(file_content)
            fp = Path(tmp.name)

    # https://biopython.org/wiki/The_Biopython_Structural_Bioinformatics_FAQ
    pdb_parser = PDB.PDBParser(QUIET=True, PERMISSIVE=0)
    structure = pdb_parser.get_structure(name, str(fp))
    
    if reindex:
        s = stream(structure).split('\n')
        x = '\n'.join(reindex_pdb(s, reindex_start_position))
        pdb_parser = PDB.PDBParser(QUIET=True, PERMISSIVE=0)
        structure = pdb_parser.get_structure('', StringIO(x))

    counts = {
       'models': 0,
       'chains': 0,
    }

    for model in structure:
        counts['models'] += 1
        for chain in model:
            # TODO: If multiple, could return all here
            counts['chains'] += 1

    if strict:
        assert sum(counts.values()) == 2, 'More than one model or chain present'

    try:
        tmp.close()
    except NameError:
        pass

    # return structure, sequence
    return structure


[docs]def load_bfactor_column(fp):
    '''
    Load annotation data stored in the bfactor column of a .pdb file
    '''
    parser = PDBParser()
    structure = parser.get_structure('', fp)
    d = {}

    for res in structure.get_residues():
        for atom in res:
            # ALA > Ala
            x = atom.parent.resname[0] + atom.parent.resname[1:].lower()  
            try:
                aa = SeqUtils.IUPACData.protein_letters_3to1[x]  
                # same value for all atoms in a resudue
            except KeyError:
                continue

            num = atom.full_id[3][1]
            d[num] = [atom.bfactor, aa]
    
    return [i for i, j in d.values()]


[docs]def parse_hyphy(fp, method='meme', direction='positive', skip=[]):
    '''
    ``hyphy`` returns endless files with lots and lots of values (granted, it
    makes the calculations very transparent). Parse them into a human-
    friendly format.

    Optional arguments:

    - skip -- use e.g. to mask gaps in an alignment, needs to be some form of binary iterator (eg [0, 0, 0, 1, 0, 0, 0, ...])
    '''
    with open(fp, 'r') as file:
        d = json.load(file)

    if method == 'meme':
        # p-value
        assert direction == 'positive', 'The MEME method does not estimate negative selection'
        scores = [i[6] for i in d['MLE']['content']['0']]
        
    elif method == 'fubar':
        # posterior probability
        neg, pos = zip(*[i[3:5] for i in d['MLE']['content']['0']])
        if direction == 'positive':
            scores = pos
        elif direction == 'negative':
            scores = neg
        else:
            raise ValueError('Direction must be positive or negative.')

    else:
        raise ValueError('Method not implemented')

    if skip:
        assert len(skip) == len(scores), 'Mask has wrong dimensions'
        scores = [p for p, sk in zip(scores, skip) if not sk]

    return scores


def get_alphafold2_model_from_ena(ID=None, fp=None):
    # eg AF-Q09428-F1
    url = f'https://alphafold.ebi.ac.uk/files/{ID}-model_v3.pdb'
    r = requests.get(url)
    assert r.status_code == 200, 'Download failed'
    pdb = r.text
    if not fp:
        fp = f'{ID}.pdb'
    with open(fp, 'w+') as out:
        out.write(pdb)
    return None


[docs]def load_scores(fp):
    '''
    Expects pLDDT scores as output by ColabFold
    '''
    with open(fp, 'r') as file:
        scores = json.load(file)
    return scores


def stream(structure):
    stream = StringIO()
    return save_pdb(structure, stream).getvalue()
Source code for faltwerk.io

faltwerk

Navigation

Related Topics