from ctypes import *
import os
import logging
squizz = True
try:
libc = cdll.LoadLibrary("libc.so.6")
# For squizz
libbioseq = CDLL("libbioseq.so.0", mode=RTLD_GLOBAL)
libbioali = CDLL("libbioali.so.0", mode=RTLD_GLOBAL)
except Exception:
logging.warn("Squizz libraries could not be loaded, skipping squizz")
squizz = False
SEQFMT_UNKWN = "unknown"
SEQFMT_SPROT = "sprot"
SEQFMT_EMBL = "embl"
SEQFMT_GENBANK = "genbank"
SEQFMT_PIR = "pir"
SEQFMT_NBRF = "nbrf"
SEQFMT_GDE = "gde"
SEQFMT_IG = "ig"
SEQFMT_FASTA = "fasta"
SEQFMT_GCG = "gcg"
SEQFMT_RAW = "raw"
SEQFMT_NONE = "none"
ALIFMT_UNKWN = SEQFMT_UNKWN
ALIFMT_CLUSTAL = "clustal"
ALIFMT_PHYLIPI = "PHYLIPI"
ALIFMT_PHYLIPS = "PHYLIPS"
ALIFMT_FASTA = "FASTA"
ALIFMT_MEGA = "MEGA"
ALIFMT_MSF = "MSF"
ALIFMT_NEXUSI = "NEXUSI"
ALIFMT_STOCK = "STOCKHOLM"
ALIFMT_NONE = SEQFMT_NONE
[docs]class SquizzDetector (object):
'''Detector using Squizz'''
def __init__( self ):
SquizzDetector.seqformats = [ SEQFMT_UNKWN, SEQFMT_SPROT, SEQFMT_EMBL, SEQFMT_GENBANK, SEQFMT_PIR,
SEQFMT_NBRF, SEQFMT_GDE, SEQFMT_IG, SEQFMT_FASTA, SEQFMT_GCG, SEQFMT_RAW,
SEQFMT_NONE ]
SquizzDetector.alignformats = [ ALIFMT_UNKWN, ALIFMT_CLUSTAL, ALIFMT_PHYLIPI, ALIFMT_PHYLIPS,
ALIFMT_FASTA, ALIFMT_MEGA, ALIFMT_MSF, ALIFMT_NEXUSI, ALIFMT_STOCK,
ALIFMT_NONE ]
[docs] def detect(self,filename):
'''Detect method'''
f = libc.fopen(filename, "r")
format = libbioseq.sequence_format(f)
if format == SEQFMT_NONE:
format = libbioali.align_format(f)
libc.fclose(f)
if format == ALIFMT_NONE:
return None
return SquizzDetector.alignformats[format]
else:
libc.fclose(f)
return SquizzDetector.seqformats[format]
[docs]class BioFormat (object):
'''Generic detector for an input sequence. Call all registered detectors until one answers'''
detectors = []
def __init__( self ):
if BioFormat.detectors is None:
BioFormat.detectors = []
self.datatypes_by_extension = {
'ab1' : 'ab1',
'axt' : 'axt',
'bam' : 'bam',
'bed' : 'bed',
'coverage' : 'coverage',
'customtrack' : 'customtrack',
'csfasta' : 'csFasta()',
'fasta' : 'fasta',
'fa' : 'fasta',
'fsa' : 'fasta',
'eland' : 'eland',
'fastq' : 'fastq()',
'fastqsanger' : 'fastqsanger()',
'gbk' : 'genbank',
'gtf' : 'gtf',
'gff' : 'gff',
'gff3' : 'gff3',
'genetrack' : 'genetrack',
'interval' : 'interval',
'laj' : 'laj',
'lav' : 'lav',
'maf' : 'maf',
'pileup' : 'pileup',
'qualsolid' : 'qualsolid',
'qualsolexa' : 'qualsolexa',
'qual454' : 'qual454',
'sam' : 'sam',
'scf' : 'scf',
'sff' : 'sff',
'tabular' : 'tabular',
'taxonomy' : 'taxonomy',
'txt' : 'txt',
'wig' : 'wig',
'xml' : 'xml',
}
self.mimetypes = {
'ab1' : 'application/octet-stream',
'axt' : 'text/plain',
'bam' : 'application/octet-stream',
'bed' : 'text/plain',
'customtrack' : 'text/plain',
'csfasta' : 'text/plain',
'eland' : 'application/octet-stream',
'fasta' : 'text/plain',
'fastq' : 'text/plain',
'fastqsanger' : 'text/plain',
'gtf' : 'text/plain',
'gff' : 'text/plain',
'gff3' : 'text/plain',
'interval' : 'text/plain',
'laj' : 'text/plain',
'lav' : 'text/plain',
'maf' : 'text/plain',
'memexml' : 'application/xml',
'pileup' : 'text/plain',
'qualsolid' : 'text/plain',
'qualsolexa' : 'text/plain',
'qual454' : 'text/plain',
'sam' : 'text/plain',
'scf' : 'application/octet-stream',
'sff' : 'application/octet-stream',
'tabular' : 'text/plain',
'taxonomy' : 'text/plain',
'txt' : 'text/plain',
'wig' : 'text/plain',
'xml' : 'application/xml',
}
@staticmethod
[docs] def register(detector):
'''
Register a format detector
:param detector: detector class
:type detector: Class
:return: format of the file
'''
BioFormat.detectors.append(detector)
[docs] def detect_by_extension(self,filename):
'''
Try to detect the format of the input file based on extension
:params filename: file to detect
:type filename: str
:return: format
'''
name, fileExtension = os.path.splitext(filename)
fileExtension = fileExtension.replace('.','')
if fileExtension in self.datatypes_by_extension:
return self.datatypes_by_extension[fileExtension]
return None
[docs] def detect(self,filename):
'''
Try to detect the format of the input file
:params filename: file to detect
:type filename: str
:return: tuple (format,mimetype)
'''
format = self.detect_by_extension(filename)
if format:
return (format,self.mimetypes[format])
for detector in BioFormat.detectors:
logging.debug("Try detector "+detector.__name__)
curdetector = detector()
format = curdetector.detect(filename)
if format is not None:
mime = 'application/octet-stream'
if format in self.mimetypes:
mime = self.mimetypes[format]
return (format,mime)
return (None,None)
if __name__ == "__main__":
if squizz:
BioFormat.register(SquizzDetector)
detector = BioFormat()
(format,mime) = detector.detect("test.fasta")
print("test.fasta: "+str(format)+" : "+str(mime))
(format,mime) = detector.detect("test.myfasta")
print("test.myfasta: "+str(format)+" : "+str(mime))