# Supplementary Table 3
# Program source (Python, www.python.org) for analysing INS transcripts in EST libraries
# Copyright 2005 Tom Gaunt, Human Genetics Division, University of Southampton
# Free for research use
from Tkinter import *
from tkFileDialog import *
from re import *
class ThisProg:
    """Tkinter front end that classifies INS EST sequences by splice isoform.

    The user picks a text file of EST records and a results file.  Every
    record (one per line) that belongs to the configured library is searched
    for a set of exon/intron junction subsequences; the combination of
    junctions found selects one of ten isoforms (or "noisoform").  SNP allele
    subsequences are counted at the same time.  A text report with the totals
    followed by one line per analysed record is written to the results file.
    """

    # Regular expression selecting the EST library to analyse; it is searched
    # for anywhere in a record line.  Alternatives that have been used:
    #   "Human Pancreatic Islets"
    #   "HR85 [Ii]slet"
    _LIBRARY_PATTERN = "[Ii]nsulinoma"

    # Text that precedes the sequence on a record line.
    _SEQUENCE_MARKER = "len="

    # Junction subsequences, in the order of the digits of the isoform key
    # (A..K).  Each entry is a regular expression - options go in square
    # brackets, e.g. A[GT]A is either AGA or ATA.  Every pair holds a pattern
    # and its reverse complement, so a junction is recognised on either strand.
    _JUNCTIONS = (
        # A exon1cryptic3A - iso 8, 9 and 10 (rare)
        ("GCCATCAAGCAGGCAGCCTGCAGC", "GCTGCAGGCTGCCTGCTTGATGGC"),
        # B exon1cryptic3B - iso 1, 3 and 5
        ("GCCATCAAGCAGCTGGAGAACTAC", "GTAGTTCTCCAGCTGCTTGATGGC"),
        # C exon2cryptic3A - iso 8, 9 and 10 (rare)
        ("AGGACCTGCAGGGCAGCCTGCAGC", "GCTGCAGGCTGCCCTGCAGGTCCT"),
        # D exon2cryptic3B - iso 1, 3 and 5
        ("AGGACCTGCAGGCTGGAGAACTAC", "GTAGTTCTCCAGCCTGCAGGTCCT"),
        # E intron2five
        ("TGCAGGGTGAGC", "GCTCACCCTGCA"),
        # F intron2three
        ("TGGCAGTGGGGC", "GCCCCACTGCCA"),
        # G exon1exon2
        ("AAGCAGATCACT", "AGTGATCTGCTT"),
        # H exon2exon3
        ("TGCAGGTGGGGC", "GCCCCACCTGCA"),
        # I exon1exon3
        ("AAGCAGTGGGGC", "GCCCCACTGCTT"),
        # J fiveprimeintron1 (exon1 - 5' part of intron1 - exon2)
        ("TTTGCGTCAGATCACTGT[CT]C", "G[AG]ACAGTGATCTGACGCAAA"),
        # K intron1 (exon1 - full intron1 - exon2)
        ("TGTC[AT]CCCAGATCACTGT[CT]C", "G[AG]ACAGTGATCTGGG[AT]GACA"),
    )

    # Isoform number for each recognised junction key (digits ABCDEFGHIJK).
    _ISOFORMS = {
        "01000000000": 1,
        "00000000100": 2,
        "00010010000": 3,
        "00000011000": 4,
        "00010000001": 5,
        "00000001001": 6,
        "00000001010": 7,
        "10000000000": 8,
        "00100010000": 9,
        "00100000001": 10,
    }

    # SNP allele subsequences: allele name -> (pattern, reverse complement).
    _SNPS = {
        "INS72C": ("ATCACTGTCCTTCTGCC", "GGCAGAAGGACAGTGAT"),
        "INS72T": ("ATCACTGTTCTTCTGCC", "GGCAGAAGAACAGTGAT"),
        "INS70G": ("CTGCTGGCGCTGCTGGC", "GCCAGCAGCGCCAGCAG"),
        "INS70A": ("CTGCTGGCACTGCTGGC", "GCCAGCAGTGCCAGCAG"),
        "INS39C": ("GCAGCCCCCCACCCGCC", "GGCGGGTGGGGGGCTGC"),
        "INS39A": ("GCAGCCCCACACCCGCC", "GGCGGGTGTGGGGCTGC"),
        "INS38C": ("ACGCAGCCCGCAGGCAG", "CTGCCTGCGGGCTGCGT"),
        "INS38T": ("ACGCAGCCTGCAGGCAG", "CTGCCTGCAGGCTGCGT"),
        "INS69I": ("GGTCTTTGCGTTCCAAG", "CTTGGAACGCAAAGACC"),
        "INS69D": ("GGTCTGTTCCAAGGGCC", "GGCCCTTGGAACAGACC"),
    }

    def __init__(self, root):
        """Build the File menu, the two instruction labels and the Run button.

        root -- the Tk root window that hosts the widgets.
        """
        frame = Frame(root)
        frame.grid()
        menu = Menu(root)
        root.config(menu=menu)
        filemenu = Menu(menu)
        menu.add_cascade(label="File", menu=filemenu)
        filemenu.add_command(label="Run", command=self.analysefile)
        filemenu.add_command(label="Exit", command=root.destroy)
        self.instruct1 = Label(frame, text="EST exon boundary detection\n\n TRG, SGEL, August 2005.")
        self.instruct1.grid(row=0, column=0)
        self.instruct2 = Label(frame, text="\nClick Run to select a file for analysis\n")
        self.instruct2.grid(row=1, column=0)
        self.calc = Button(frame, text="Run", command=self.analysefile)
        self.calc.grid(row=2, column=0)
        # Number of analyses started in this session (shown in the status text).
        self.analysiscount = 0

    def analysefile(self):
        """Ask for an input and an output file, analyse the input, save the report.

        Does nothing if the user cancels either dialog.  On success the
        second label is updated with the running analysis number.
        """
        sequencesfilename = askopenfilename(defaultextension='.txt',filetypes=[('Sequence files','*.txt'),('All files','*.*')])
        if not sequencesfilename:
            # Dialog cancelled: there is nothing to open.
            return
        resultsfilename = asksaveasfilename(defaultextension='.txt',filetypes=[('Result file','*.txt'),('All files','*.*')])
        if not resultsfilename:
            return
        with open(sequencesfilename) as sequencesfile:
            sequencesdata = sequencesfile.readlines()
        self.analysiscount = self.analysiscount + 1
        output = self._build_report(sequencesdata)
        with open(resultsfilename, 'w') as resultsfile:
            resultsfile.write(output)
        self.instruct2.config(text= "Analysis number " + str(self.analysiscount) + " completed\nSelect Exit from the File menu.\nOr click Run to start another\n")
        self.instruct2.update()

    def _junction_key(self, thissequence):
        """Return the 11-digit "0"/"1" key of the junctions found in *thissequence*.

        Digit n is "1" when either pattern of the n-th entry of _JUNCTIONS
        occurs in the sequence.
        """
        # re.search returns None when nothing matches, so test for that
        # explicitly instead of comparing the result with a number.
        return "".join(
            "1" if any(search(subseq, thissequence) is not None for subseq in patterns) else "0"
            for patterns in self._JUNCTIONS
        )

    def _build_report(self, sequencesdata):
        """Classify the library records in *sequencesdata* and return the report text.

        sequencesdata -- iterable of record lines as read from the input file.

        Only lines matching _LIBRARY_PATTERN are analysed; the sequence is the
        text following the first "len=" on the line.  Library lines without a
        "len=" field carry no sequence and are skipped without being counted.
        The per-isoform totals are left in self.isoform1count ..
        self.isoform10count; "noisoform" is printed for every analysed record
        that matches no known junction combination.
        """
        for number in range(1, 11):
            setattr(self, "isoform%dcount" % number, 0)
            # Never read by this class; still reset on every run so instances
            # keep the same attributes as before.
            setattr(self, "isoform%d" % number, 0)
        snpcounts = dict.fromkeys(self._SNPS, 0)
        total = 0
        details = []
        for dataline in sequencesdata:
            if search(self._LIBRARY_PATTERN, dataline) is None:
                continue
            fields = dataline.split(self._SEQUENCE_MARKER)
            if len(fields) < 2:
                continue
            thissequence = fields[1]
            total = total + 1
            number = self._ISOFORMS.get(self._junction_key(thissequence))
            if number is None:
                name = "noisoform"
                print("noisoform")
            else:
                name = "isoform%d" % number
                setattr(self, name + "count", getattr(self, name + "count") + 1)
            for snp, patterns in self._SNPS.items():
                for subseq in patterns:
                    # Each strand pattern found counts once, as before.
                    if search(subseq, thissequence) is not None:
                        snpcounts[snp] = snpcounts[snp] + 1
            details.append(name + "\t" + dataline)
        # Assemble the report once at the end rather than growing one string.
        parts = ["\n\n" + " Total sequences: " + str(total)]
        for number in range(1, 11):
            parts.append("\nisoform %d: " % number + str(getattr(self, "isoform%dcount" % number)))
        for snp in sorted(snpcounts):
            parts.append("\n" + snp + ": " + str(snpcounts[snp]))
        parts.append("\n==================================================\n")
        parts.extend(details)
        return "".join(parts)
# Start the GUI only when the file is run as a script, so that importing the
# module (e.g. to reuse ThisProg) does not open a window and block in the
# Tk event loop.
if __name__ == "__main__":
    root = Tk()
    app = ThisProg(root)
    root.mainloop()