Supplementary Table 3

Program source (Python, www.python.org) for analysing INS transcripts in EST libraries

# Free for research use

from Tkinter import *

from tkFileDialog import *

from re import *

class ThisProg:
    """Tk front end that classifies INS transcripts in an EST library file.

    The user picks an input text file and an output file.  Every non-blank
    input line that matches LIBRARY_PATTERN is treated as one record; the
    text following SEQUENCE_MARKER on that line is searched for a fixed set
    of exon/intron junction probes and SNP probes.  The pattern of junction
    hits (the "signature") is looked up in ISOFORM_SIGNATURES to name the
    isoform, and a summary report is written to the output file.
    """

    # Only records whose line matches this regular expression are analysed.
    # Edit it to select another library, e.g. "Human Pancreatic Islets" or
    # "HR85 [Ii]slet".
    LIBRARY_PATTERN = "[Ii]nsulinoma"

    # Text that precedes the sequence on each record line.
    SEQUENCE_MARKER = "len="

    # Rule separating the summary from the per-record rows in the report.
    REPORT_RULE = "\n==================================================\n"

    # Junction probes, listed in signature order (positions A..K).  Each
    # entry holds the forward probe and its reverse complement.  Probes are
    # regular expressions: options go in square brackets, e.g. A[GT]A is
    # either AGA or ATA.
    JUNCTION_PROBES = (
        # A - iso 8, 9 and 10 (rare)
        ("exon1cryptic3A", ("GCCATCAAGCAGGCAGCCTGCAGC", "GCTGCAGGCTGCCTGCTTGATGGC")),
        # B - iso 1, 3 and 5
        ("exon1cryptic3B", ("GCCATCAAGCAGCTGGAGAACTAC", "GTAGTTCTCCAGCTGCTTGATGGC")),
        # C - iso 8, 9 and 10 (rare)
        ("exon2cryptic3A", ("AGGACCTGCAGGGCAGCCTGCAGC", "GCTGCAGGCTGCCCTGCAGGTCCT")),
        # D - iso 1, 3 and 5
        ("exon2cryptic3B", ("AGGACCTGCAGGCTGGAGAACTAC", "GTAGTTCTCCAGCCTGCAGGTCCT")),
        # E
        ("intron2five", ("TGCAGGGTGAGC", "GCTCACCCTGCA")),
        # F
        ("intron2three", ("TGGCAGTGGGGC", "GCCCCACTGCCA")),
        # G
        ("exon1exon2", ("AAGCAGATCACT", "AGTGATCTGCTT")),
        # H
        ("exon2exon3", ("TGCAGGTGGGGC", "GCCCCACCTGCA")),
        # I
        ("exon1exon3", ("AAGCAGTGGGGC", "GCCCCACTGCTT")),
        # J
        ("fiveprimeintron1", ("TTTGCGTCAGATCACTGT[CT]C", "G[AG]ACAGTGATCTGACGCAAA")),
        # K
        ("intron1", ("TGTC[AT]CCCAGATCACTGT[CT]C", "G[AG]ACAGTGATCTGGG[AT]GACA")),
    )

    # Signature (one 0/1 digit per junction probe, order ABCDEFGHIJK) -> name.
    ISOFORM_SIGNATURES = {
        "01000000000": "isoform1",
        "00000000100": "isoform2",
        "00010010000": "isoform3",
        "00000011000": "isoform4",
        "00010000001": "isoform5",
        "00000001001": "isoform6",
        "00000001010": "isoform7",
        "10000000000": "isoform8",
        "00100010000": "isoform9",
        "00100000001": "isoform10",
    }

    # SNP allele name -> (forward probe, reverse complement probe).
    SNP_PROBES = {
        "INS72C": ("ATCACTGTCCTTCTGCC", "GGCAGAAGGACAGTGAT"),
        "INS72T": ("ATCACTGTTCTTCTGCC", "GGCAGAAGAACAGTGAT"),
        "INS70G": ("CTGCTGGCGCTGCTGGC", "GCCAGCAGCGCCAGCAG"),
        "INS70A": ("CTGCTGGCACTGCTGGC", "GCCAGCAGTGCCAGCAG"),
        "INS39C": ("GCAGCCCCCCACCCGCC", "GGCGGGTGGGGGGCTGC"),
        "INS39A": ("GCAGCCCCACACCCGCC", "GGCGGGTGTGGGGCTGC"),
        "INS38C": ("ACGCAGCCCGCAGGCAG", "CTGCCTGCGGGCTGCGT"),
        "INS38T": ("ACGCAGCCTGCAGGCAG", "CTGCCTGCAGGCTGCGT"),
        "INS69I": ("GGTCTTTGCGTTCCAAG", "CTTGGAACGCAAAGACC"),
        "INS69D": ("GGTCTGTTCCAAGGGCC", "GGCCCTTGGAACAGACC"),
    }

    def __init__(self, root):
        """Build the window: a File menu, two labels and a Run button.

        root -- the Tk root window the widgets are attached to.
        """
        frame = Frame(root)
        frame.grid()

        menu = Menu(root)
        root.config(menu=menu)
        filemenu = Menu(menu)
        menu.add_cascade(label="File", menu=filemenu)
        filemenu.add_command(label="Run", command=self.analysefile)
        filemenu.add_command(label="Exit", command=root.destroy)

        self.instruct1 = Label(frame, text="EST exon boundary detection\n\n TRG, SGEL, August 2005.")
        self.instruct1.grid(row=0, column=0)
        self.instruct2 = Label(frame, text="\nClick Run to select a file for analysis\n")
        self.instruct2.grid(row=1, column=0)
        self.calc = Button(frame, text="Run", command=self.analysefile)
        self.calc.grid(row=2, column=0)

        # Number of analyses completed in this session (shown in the label).
        self.analysiscount = 0

    def analysefile(self):
        """Ask for input and output files, analyse the input, save the report.

        Does nothing if either file dialog is cancelled.  On completion the
        second label is updated with the running analysis number.
        """
        sequencesfilename = askopenfilename(defaultextension='.txt',filetypes=[('Sequence files','*.txt'),('All files','*.*')])
        if not sequencesfilename:
            # Dialog cancelled: there is nothing to open.
            return
        resultsfilename = asksaveasfilename(defaultextension='.txt',filetypes=[('Result file','*.txt'),('All files','*.*')])
        if not resultsfilename:
            return

        with open(sequencesfilename) as sequencesfile:
            sequencesdata = sequencesfile.readlines()

        self.analysiscount = self.analysiscount + 1
        output = self._build_report(sequencesdata)

        with open(resultsfilename, 'w') as resultsfile:
            resultsfile.write(output)

        self.instruct2.config(text= "Analysis number " + str(self.analysiscount) + " completed\nSelect Exit from the File menu.\nOr click Run to start another\n")
        self.instruct2.update()

    def _signature(self, sequence):
        """Return the 11-digit 0/1 junction signature (order ABCDEFGHIJK).

        A digit is "1" when either the forward or the reverse complement
        probe of the corresponding JUNCTION_PROBES entry occurs in sequence.
        """
        digits = []
        for _label, probes in self.JUNCTION_PROBES:
            found = any(search(probe, sequence) is not None for probe in probes)
            digits.append("1" if found else "0")
        return "".join(digits)

    def _build_report(self, sequencesdata):
        """Classify every selected record and return the report text.

        sequencesdata -- iterable of input lines, one record per line.

        Blank lines and lines not matching LIBRARY_PATTERN are ignored.
        Matching lines that lack SEQUENCE_MARKER carry no sequence; they are
        reported on standard output and left out of the totals.  Records
        whose signature is not in ISOFORM_SIGNATURES are labelled
        "noisoform" (also echoed on standard output).

        The per-isoform totals are stored on the instance as
        isoform1count .. isoform10count.  The returned text holds the total
        number of records, the isoform totals, the SNP totals in sorted
        allele order, a rule, and then one "<isoform>\\t<input line>" row per
        analysed record.
        """
        isoformcounts = dict.fromkeys(self.ISOFORM_SIGNATURES.values(), 0)
        snpcounts = dict.fromkeys(self.SNP_PROBES, 0)
        rows = []
        total = 0

        for dataline in sequencesdata:
            if not dataline.split():
                continue
            if search(self.LIBRARY_PATTERN, dataline) is None:
                continue
            pieces = dataline.split(self.SEQUENCE_MARKER)
            if len(pieces) < 2:
                print("skipped record without " + self.SEQUENCE_MARKER + " field")
                continue

            total = total + 1
            thissequence = pieces[1]

            isoformname = self.ISOFORM_SIGNATURES.get(self._signature(thissequence))
            if isoformname is None:
                isoformname = "noisoform"
                print("noisoform")
            else:
                isoformcounts[isoformname] = isoformcounts[isoformname] + 1

            # An allele is counted once per probe orientation that matches.
            for snp, probes in self.SNP_PROBES.items():
                for probe in probes:
                    if search(probe, thissequence) is not None:
                        snpcounts[snp] = snpcounts[snp] + 1

            rows.append(isoformname + "\t" + dataline)

        summary = ["\n\n" + " Total sequences: " + str(total)]
        for number in range(1, 11):
            count = isoformcounts["isoform" + str(number)]
            # Instance attributes kept for compatibility with earlier versions.
            setattr(self, "isoform" + str(number) + "count", count)
            setattr(self, "isoform" + str(number), 0)
            summary.append("\nisoform " + str(number) + ": " + str(count))
        for snp in sorted(snpcounts):
            summary.append("\n" + snp + ": " + str(snpcounts[snp]))
        summary.append(self.REPORT_RULE)

        return "".join(summary + rows)

# Script entry: create the Tk root window, build the application on it, and
# run the Tk event loop (mainloop blocks until the window is destroyed).
root = Tk()

app = ThisProg(root)

root.mainloop()