从序列中提取基因间区域。有时,阅读基因之间的区域更有趣……。
以下可运行脚本读取一个 genbank 文件,该文件可能是一个基因组或染色体文件。它使用 CDS 特征来发现 ORF 的 5' 和 3' 末端。是的,ORF 与基因并不完全相同,但这是我们采用的方式。此外,如果您也对编码 RNA 的基因感兴趣,您可能需要将 CDS 特征替换为“gene”特征。“intergene_length”变量是基因间区域的最小长度阈值,默认设置为 1。该程序将输出到以“_ign.fasta”为后缀的文件中。该程序根据 genbank 文件的注释输出 + 链或反向互补链。输出以 FASTA 格式,标题包括基因间区域坐标、唯一 ID 以及序列是来自 + 链还是 - 链。
#!/usr/bin/env python
import sys
import Bio
from Bio import SeqIO, SeqFeature
from Bio.SeqRecord import SeqRecord
import os
# Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
# Do not remove this comment
def get_interregions(genbank_path, intergene_length=1):
seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
cds_list_plus = []
cds_list_minus = []
intergenic_records = []
# Loop over the genome file, get the CDS features on each of the strands
for feature in seq_record.features:
if feature.type == "CDS":
mystart = feature.location._start.position
myend = feature.location._end.position
if feature.strand == -1:
cds_list_minus.append((mystart, myend, -1))
elif feature.strand == 1:
cds_list_plus.append((mystart, myend, 1))
else:
sys.stderr.write(
"No strand indicated %d-%d. Assuming +\n" % (mystart, myend)
)
cds_list_plus.append((mystart, myend, 1))
for i, pospair in enumerate(cds_list_plus[1:]):
# Compare current start position to previous end position
last_end = cds_list_plus[i][1]
this_start = pospair[0]
strand = pospair[2]
if this_start - last_end >= intergene_length:
intergene_seq = seq_record.seq[last_end:this_start]
strand_string = "+"
intergenic_records.append(
SeqRecord(
intergene_seq,
id="%s-ign-%d" % (seq_record.name, i),
description="%s %d-%d %s"
% (seq_record.name, last_end + 1, this_start, strand_string),
)
)
for i, pospair in enumerate(cds_list_minus[1:]):
last_end = cds_list_minus[i][1]
this_start = pospair[0]
strand = pospair[2]
if this_start - last_end >= intergene_length:
intergene_seq = seq_record.seq[last_end:this_start]
strand_string = "-"
intergenic_records.append(
SeqRecord(
intergene_seq,
id="%s-ign-%d" % (seq_record.name, i),
description="%s %d-%d %s"
% (seq_record.name, last_end + 1, this_start, strand_string),
)
)
outpath = os.path.splitext(os.path.basename(genbank_path))[0] + "_ign.fasta"
SeqIO.write(intergenic_records, open(outpath, "w"), "fasta")
if __name__ == "__main__":
if len(sys.argv) == 2:
get_interregions(sys.argv[1])
elif len(sys.argv) == 3:
get_interregions(sys.argv[1], int(sys.argv[2]))
else:
print("Usage: get_intergenic.py gb_file [intergenic_length]")
sys.exit(0)
有关代码的完整解释,请参见此处:bytesizebio.net
./get\_intergene mygenbankfile.gb 1
在 genbank 文件上运行。基因间序列将以 FASTA 格式显示在 mygenbank_ign.fasta 文件中。