This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Define usage | |
usage(){ | |
echo "Usage:" | |
echo "$0 sites perms size build" | |
echo "" | |
echo "Where:" | |
echo "sites: number of random autosomal sites per permutation" | |
echo "perms: number of permutations to conduct" | |
echo "size: total window size around site (ex: 50kb will be -/+ 25kb of site)" | |
echo "build: genome build (ex:hg18,hg19)" | |
echo "" | |
exit 1 | |
} | |
[[ $# -ne 4 ]] && usage | |
sites=$1 | |
perms=$2 | |
size=$3 | |
build=$4 | |
# Download gap and length files for correct genome build | |
if [[ $build == "hg19" ]] ; then | |
echo "Genome Build: hg19" | |
echo "Fetching gap and length files..." | |
wget -q -O - ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gap.txt.gz | gunzip -c > gap.txt | |
wget -q -O - ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz | gunzip -c > cytoBand.txt | |
echo "Download complete." | |
elif [[ $build == "hg18" ]] ; then | |
echo "Genome Build: hg18" | |
echo "Fetching gap and length files..." | |
for i in {1..22} | |
do | |
wget -q -O - ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/chr${i}_gap.txt.gz | gunzip -c > chr${i}_gap.txt | |
done | |
cat chr1_gap.txt chr2_gap.txt chr3_gap.txt chr4_gap.txt chr5_gap.txt chr6_gap.txt chr7_gap.txt chr8_gap.txt chr9_gap.txt chr10_gap.txt chr11_gap.txt chr12_gap.txt chr13_gap.txt chr14_gap.txt chr15_gap.txt chr16_gap.txt chr17_gap.txt chr18_gap.txt chr19_gap.txt chr20_gap.txt chr21_gap.txt chr22_gap.txt > gap.txt | |
for i in {1..22} | |
do | |
rm chr${i}_gap.txt | |
done | |
wget -q -O - ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/cytoBand.txt.gz | gunzip -c > cytoBand.txt | |
echo "Download complete." | |
else | |
echo "ERROR: Genome build needs to be either hg18 or hg19" | |
exit 1 | |
fi | |
# Run Python Script | |
python make_random.py $sites $perms $size | |
# Remove gap and length files | |
rm gap.txt | |
rm cytoBand.txt | |
echo "Program completed." |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from operator import itemgetter | |
# Define parameters | |
sites=int(sys.argv[1]) | |
perms=int(sys.argv[2]) | |
size=int(sys.argv[3]) | |
# Define functions | |
def chr_length(): | |
# Import chromosome cytobands | |
cyto=open("cytoBand.txt").readlines() | |
chr_len={} | |
for i in range(len(cyto)): | |
chr_raw,start,stop,id,g=cyto[i].strip().split() | |
chr=chr_raw.strip("chr") | |
if chr in chr_len: | |
if int(stop)>chr_len[chr]: | |
chr_len[chr]=int(stop) | |
else: | |
chr_len[chr]=int(stop) | |
del chr_len['Y'] | |
del chr_len['X'] | |
chr_lst=[] | |
for chr in sorted([int(x) for x in chr_len]): | |
chr_rep=[str(chr)] * int(round(chr_len[str(chr)]/10000000,0)) | |
for rep in chr_rep: | |
chr_lst.append(rep) | |
return chr_len, chr_lst | |
def chr_gaps(): | |
# Import chromosome gaps | |
gap_raw=open("gap.txt").readlines() | |
gaps={} | |
for i in range(len(gap_raw)): | |
bin,chr_raw,start,stop,ix,n,sz,type,br=gap_raw[i].strip().split() | |
chr=chr_raw.strip("chr") | |
gap_region=[int(start),int(stop)] | |
if chr in gaps: | |
gaps[chr].append(gap_region) | |
else: | |
gaps[chr]=[gap_region] | |
return gaps | |
def rand_sites(sites,size,seed_n,chr_len,chr_lst,gaps): | |
from random import choice,uniform,seed | |
seed(seed_n) | |
count=1 | |
# Choose chromosome and point on chromosome | |
while count<=sites: | |
chr=choice(chr_lst) | |
point=int(uniform(0+size/2,chr_len[chr]-size/2)) | |
# Exclude inaccessible regions | |
include="T" | |
for start,stop in gaps[chr]: | |
if start<=point<=stop: | |
include="F" | |
# Return points in accessible regions | |
if include=="T": | |
yield count,int(chr),point-size/2,point+size/2 | |
count+=1 | |
def sort_coords(coords,cols=itemgetter(1,2)): | |
unsorted=[] | |
coord=iter(coords) | |
item=next(coord,None) | |
while item: | |
unsorted.append(item) | |
item=next(coord,None) | |
sorted_regions=sorted(unsorted, key=cols) | |
for i in range(len(sorted_regions)): | |
yield sorted_regions[i] | |
def make_random(sites,size,seed_n,chr_len,chr_lst,gaps): | |
from operator import itemgetter | |
a=rand_sites(sites,size,seed_n,chr_len,chr_lst,gaps) | |
b=sort_coords(a) | |
return b | |
def make_bed(gen_out, out_file_dir): | |
out=open(out_file_dir, "w") | |
for count,chr,start,end in gen_out: | |
print >> out, "chr"+str(chr) +"\t"+ str(start) +"\t"+ str(end) +"\t"+ str(count) | |
def main(sites, perms, size): | |
chr_len,chr_lst=chr_length() | |
gaps=chr_gaps() | |
for i in range(1,perms+1): | |
a=make_random(sites,size,i,chr_len,chr_lst,gaps) | |
make_bed(a, "permutation_"+str(i)+".bed") | |
print "Permutation "+str(i)+" complete." | |
if __name__=='__main__': | |
main(sites, perms, size) |
Hi,
ReplyDeleteThank you for the function but what about chrX and chrY?
I see that all the BED files do not include any locations from these chromosomes.
Thanks!
Hi Michal,
DeleteYou are correct, this will only produce random genomic coordinates for the autosomes and not the sex chromosomes. The code can be modified to include the sex chromosomes. In the UNIX shell script for hg18 include the chrX and chrY gap files. In the Python script, comment out lines 23 and 24. I haven't tested this, but it should get you pretty close to producing what you need.
I tried this script and this is the error I got:
ReplyDeleteGenome Build: hg19
Fetching gap and length files...
Download complete.
Traceback (most recent call last):
File "make_random.py", line 72, in ?
def sort_coords(coords,cols=itemgetter(1,2)):
TypeError: itemgetter expected 1 arguments, got 2
Program completed.
Any idea why? Thanks.