cleanUrl: "annotating-bed-file-in-pandas"
description: "Pandas dataframe 형태로 표현된 BED 파일 annotation 하기"

Annotation하고자 하는 interval들을 interval tree에 저장 후 annotation BED 파일을 순회하면서 tree에 query를 던진다.

from collections import defaultdict

import pandas as pd
from intervaltree import IntervalTree

# Load both tables. Each must provide `chrom`, `start`, and `end` columns,
# since those attributes are read from every row below.
data = pd.read_csv(INTERVAL_FILE)  # Intervals to be annotated.
bed = pd.read_csv(ANNOTATION_FILE)  # Annotation regions to query with.

# Build one interval tree per chromosome. Each stored interval carries the
# positional row number of `data` as its payload so hits can be mapped back.
intervals = defaultdict(IntervalTree)
for i, r in enumerate(data.to_records()):
    intervals[r.chrom].addi(r.start, r.end, i)

mask = [False] * len(data)  # mask[i] becomes True once row i of `data` is hit.

for r in bed.to_records():
    # Use .get() so chromosomes that appear only in the annotation file do
    # not create empty trees in the defaultdict as a side effect.
    tree = intervals.get(r.chrom)
    if tree is None:
        continue

    # Envelop query: intervals completely contained in [r.start, r.end).
    for _start, _end, idx in tree.envelop(r.start, r.end):
        mask[idx] = True

    # Or, overlap query: intervals sharing any position with [r.start, r.end).
    # The slice must be `start:end`; the previous `[r.start:, r.end]` built a
    # (slice, int) tuple, which IntervalTree does not treat as a range query.
    # NOTE: overlap hits are a superset of envelop hits, so keep only the
    # query whose semantics you actually want.
    for _start, _end, idx in tree[r.start:r.end]:
        mask[idx] = True