# Supplementary Table 3
#
# Program source (Python, www.python.org) for analysing INS transcripts in EST libraries

 

# Copyright 2005 Tom Gaunt, Human Genetics Division, University of Southampton

# Free for research use

 

from Tkinter import *

from tkFileDialog import *

from re import *

class ThisProg:
    """Tk front end that classifies INS EST sequences by splice isoform.

    The window offers a "Run" action (menu entry and button).  Each run asks
    for a sequence file and a result file, scans every line of the sequence
    file that mentions the configured EST library, tests the part of the
    line after "len=" against a fixed set of junction probes, maps the
    resulting 11-character presence signature onto one of ten isoforms,
    counts SNP allele probes, and writes a text report.

    Attributes set by the code:
        instruct1, instruct2, calc: the two labels and the Run button.
        analysiscount: number of analyses started since the window opened.
        isoform1count .. isoform10count: per-isoform hits of the last run.
        isoform1 .. isoform10: always 0 after a run; kept only because the
            original program set them (nothing in this file reads them).
    """

    # Regular expression selecting the EST library to analyse; only lines it
    # matches are processed.  Alternatives left behind by the author were
    # "Human Pancreatic Islets" and "HR85 [Ii]slet".
    LIBRARY_PATTERN = "[Ii]nsulinoma"

    # Separator written between the summary and the per-sequence listing.
    REPORT_RULE = "\n==================================================\n"

    # Junction probes, in the order in which their presence flags are
    # concatenated into the isoform signature (positions A..K).  Every entry
    # is a pair of regular expressions (options go in square brackets, e.g.
    # A[GT]A is either AGA or ATA); the second of each pair reads as the
    # reverse complement of the first, so either strand is detected.
    JUNCTION_PROBES = (
        # A - iso 8, 9 and 10 (rare)
        ("exon1cryptic3A", ("GCCATCAAGCAGGCAGCCTGCAGC", "GCTGCAGGCTGCCTGCTTGATGGC")),
        # B - iso 1, 3 and 5
        ("exon1cryptic3B", ("GCCATCAAGCAGCTGGAGAACTAC", "GTAGTTCTCCAGCTGCTTGATGGC")),
        # C - iso 8, 9 and 10 (rare)
        ("exon2cryptic3A", ("AGGACCTGCAGGGCAGCCTGCAGC", "GCTGCAGGCTGCCCTGCAGGTCCT")),
        # D - iso 1, 3 and 5
        ("exon2cryptic3B", ("AGGACCTGCAGGCTGGAGAACTAC", "GTAGTTCTCCAGCCTGCAGGTCCT")),
        # E
        ("intron2five", ("TGCAGGGTGAGC", "GCTCACCCTGCA")),
        # F
        ("intron2three", ("TGGCAGTGGGGC", "GCCCCACTGCCA")),
        # G
        ("exon1_2", ("AAGCAGATCACT", "AGTGATCTGCTT")),
        # H
        ("exon2_3", ("TGCAGGTGGGGC", "GCCCCACCTGCA")),
        # I
        ("exon1_3", ("AAGCAGTGGGGC", "GCCCCACTGCTT")),
        # J - exon 1 joined to the 5' part of intron 1
        ("exon1_i15_2", ("TTTGCGTCAGATCACTGT[CT]C", "G[AG]ACAGTGATCTGACGCAAA")),
        # K - exon 1 joined to the full intron 1
        ("exon1_i1full_2", ("TGTC[AT]CCCAGATCACTGT[CT]C", "G[AG]ACAGTGATCTGGG[AT]GACA")),
    )

    # Presence signature (one character per probe above) -> isoform number.
    # Any other signature is reported as "noisoform".
    #    ABCDEFGHIJK
    ISOFORM_SIGNATURES = {
        "01000000000": 1,
        "00000000100": 2,
        "00010010000": 3,
        "00000011000": 4,
        "00010000001": 5,
        "00000001001": 6,
        "00000001010": 7,
        "10000000000": 8,
        "00100010000": 9,
        "00100000001": 10,
    }

    # SNP allele probes: allele label -> pair of 17-base sequences (the
    # second reads as the reverse complement of the first).
    SNPS = {
        "INS72C": ["ATCACTGTCCTTCTGCC", "GGCAGAAGGACAGTGAT"],
        "INS72T": ["ATCACTGTTCTTCTGCC", "GGCAGAAGAACAGTGAT"],
        "INS70G": ["CTGCTGGCGCTGCTGGC", "GCCAGCAGCGCCAGCAG"],
        "INS70A": ["CTGCTGGCACTGCTGGC", "GCCAGCAGTGCCAGCAG"],
        "INS39C": ["GCAGCCCCCCACCCGCC", "GGCGGGTGGGGGGCTGC"],
        "INS39A": ["GCAGCCCCACACCCGCC", "GGCGGGTGTGGGGCTGC"],
        "INS38C": ["ACGCAGCCCGCAGGCAG", "CTGCCTGCGGGCTGCGT"],
        "INS38T": ["ACGCAGCCTGCAGGCAG", "CTGCCTGCAGGCTGCGT"],
        "INS69I": ["GGTCTTTGCGTTCCAAG", "CTTGGAACGCAAAGACC"],
        "INS69D": ["GGTCTGTTCCAAGGGCC", "GGCCCTTGGAACAGACC"],
    }

    def __init__(self, root):
        """Build the menu bar, the two labels and the Run button inside root.

        root is the Tk top-level window; it receives the menu bar and is
        destroyed by the File > Exit entry.
        """
        frame = Frame(root)
        frame.grid()

        menu = Menu(root)
        root.config(menu=menu)

        filemenu = Menu(menu)
        menu.add_cascade(label="File", menu=filemenu)
        filemenu.add_command(label="Run", command=self.analysefile)
        filemenu.add_command(label="Exit", command=root.destroy)

        # These four statements were indented by 7 instead of 8 spaces in the
        # original, which is an IndentationError; they belong to __init__.
        self.instruct1 = Label(frame, text="EST exon boundary detection\n\n TRG, SGEL, August 2005.")
        self.instruct1.grid(row=0, column=0)

        self.instruct2 = Label(frame, text="\nClick Run to select a file for analysis\n")
        self.instruct2.grid(row=1, column=0)

        self.calc = Button(frame, text="Run", command=self.analysefile)
        self.calc.grid(row=2, column=0)

        self.analysiscount = 0

    def analysefile(self):
        """Ask for the input and output files, run one analysis, save the report.

        Returns None.  If either file dialog is cancelled (it then returns an
        empty name) nothing is read or written and the analysis counter is
        left untouched.  Errors from opening, reading or writing the files
        propagate to the caller.
        """
        sequencesfilename = askopenfilename(defaultextension='.txt',
                                            filetypes=[('Sequence files', '*.txt'), ('All files', '*.*')])
        if not sequencesfilename:
            return
        resultsfilename = asksaveasfilename(defaultextension='.txt',
                                            filetypes=[('Result file', '*.txt'), ('All files', '*.*')])
        if not resultsfilename:
            return

        # "with" closes the file even when reading raises.
        with open(sequencesfilename) as sequencesfile:
            sequencesdata = sequencesfile.readlines()

        self.analysiscount = self.analysiscount + 1
        output = self._buildreport(sequencesdata)

        with open(resultsfilename, 'w') as resultsfile:
            resultsfile.write(output)

        self.instruct2.config(text="Analysis number " + str(self.analysiscount) + " completed\nSelect Exit from the File menu.\nOr click Run to start another\n")
        self.instruct2.update()

    def _signature(self, thissequence):
        """Return the 11-character "0"/"1" string for thissequence.

        Character i is "1" when either probe of JUNCTION_PROBES[i] is found
        anywhere in thissequence, else "0".
        """
        flags = []
        for _name, probes in self.JUNCTION_PROBES:
            present = "0"
            for probe in probes:
                # re.search returns None for "no match"; test that explicitly
                # instead of relying on Python 2 ordering of object vs. int.
                if search(probe, thissequence) is not None:
                    present = "1"
                    break
            flags.append(present)
        return "".join(flags)

    def _buildreport(self, sequencesdata):
        """Analyse the given lines and return the complete report text.

        sequencesdata is an iterable of text lines, each expected to look
        like "<description containing the library name> len=<sequence>".
        Blank lines and lines not matching LIBRARY_PATTERN are ignored.  When
        a matching line has no "len=" marker the whole line is searched
        instead of failing with an IndexError.

        Side effects: prints "noisoform" once per analysed line whose
        signature is not in ISOFORM_SIGNATURES, and stores the per-isoform
        totals in self.isoform1count .. self.isoform10count.
        """
        isoformcounts = dict.fromkeys(self.ISOFORM_SIGNATURES.values(), 0)
        snpcounts = dict.fromkeys(self.SNPS, 0)
        listing = []
        total = 0

        for dataline in sequencesdata:
            if not dataline.split():
                continue
            if search(self.LIBRARY_PATTERN, dataline) is None:
                continue
            total = total + 1

            # The sequence is the text between the first and second "len=".
            pieces = dataline.split("len=")
            thissequence = pieces[1] if len(pieces) > 1 else dataline

            # The signature is computed from scratch for every line, so no
            # flag (exon1_3 in the original) can be read before it is set.
            number = self.ISOFORM_SIGNATURES.get(self._signature(thissequence))
            if number is None:
                print("noisoform")
                isoformname = "noisoform"
            else:
                isoformcounts[number] = isoformcounts[number] + 1
                isoformname = "isoform" + str(number)

            # A line counts once per matching probe, so a sequence holding
            # both members of a pair adds two to that allele.
            for snp, probes in self.SNPS.items():
                for probe in probes:
                    if search(probe, thissequence) is not None:
                        snpcounts[snp] = snpcounts[snp] + 1

            listing.append(isoformname + "\t" + dataline)

        for number in range(1, 11):
            setattr(self, "isoform%dcount" % number, isoformcounts[number])
            # Legacy attribute: the original always reset it to 0 and never
            # used it afterwards.
            setattr(self, "isoform%d" % number, 0)

        pieces = ["\n\n Total sequences: " + str(total)]
        for number in range(1, 11):
            pieces.append("\nisoform " + str(number) + ": " + str(isoformcounts[number]))
        for snp in sorted(snpcounts):
            pieces.append("\n" + snp + ": " + str(snpcounts[snp]))
        pieces.append(self.REPORT_RULE)
        pieces.extend(listing)
        return "".join(pieces)

 

# Program entry: create the Tk top-level window, build the application's
# widgets inside it, then hand control to the Tk event loop.  The File > Exit
# menu entry calls root.destroy, which ends the loop.
root = Tk()

app = ThisProg(root)

root.mainloop()