Sync that last bit with trunk. I'll have to merge that over to the tag for the next RC.
[bioperl-live.git] / examples / align / simplealign.pl
blob7119bc2c71574de215246719240eaa01dc50deb5
1 #!/usr/bin/perl
3 # PROGRAM : simplealign.pl
4 # PURPOSE : Simple driver for Bio::SimpleAlign
5 # AUTHOR : Ewan Birney birney@sanger.ac.uk
6 # CREATED : Tue Oct 27 1998
7 # REVISION : $Id$
9 # INSTALLATION
10 # If you have installed bioperl using the standard
11 # makefile system everything should be fine and
12 # dandy.
14 # If not, edit the use lib "...." line to point to the directory
15 # containing your Bioperl modules.
18 #use lib "/nfs/disk21/birney/prog/bioperl/";
20 # Modified 3/5/01 to use AlignIO by Peter Schattner schattner@alum.mit.edu
23 # This uses the internal DATA stream (everything past the
24 # __END__ tag at the end of this file) to load in the data. We
25 # then reformat it, sort it in a different way and take a quick
26 # look inside the alignment. All pretty simple ;)
30 # The simplealign module does not do the following things
31 # a) give you sensible ways of asking if residues are a
32 # column of gaps or conservation
33 # b) provide ways of editing the alignment
34 # c) making alignments
38 # a) and b) are probably best done by UnivAlign from Georg Fuellen
39 # c) is done for pairwise alignments in Bio::Tools::pSW; and
40 # also you can read in stuff from programs like clustal and hmmer
41 # into this.
use strict;
use warnings;
use Bio::SimpleAlign;
use Bio::AlignIO;

# Read the alignment from the DATA stream, i.e. the Pfam-format text
# that follows the __END__ tag of this file.
my $str = Bio::AlignIO->newFh( '-fh' => \*DATA, '-format' => 'pfam' );
my $al  = <$str>;

# Fail with a clear message here rather than with an obscure
# "can't call method on undefined value" further down.
die "Could not read a Pfam alignment from the DATA section\n"
    unless defined $al;

# Write the alignment out as an MSF file.
my $out = Bio::AlignIO->newFh( '-fh' => \*STDOUT, '-format' => 'msf' );
print {$out} $al;

# Order alphabetically by name, then by start and end.
$al->sort_alphabetically();

# Write it in Pfam format now...
my $out2 = Bio::AlignIO->newFh( '-fh' => \*STDOUT, '-format' => 'pfam' );
print {$out2} $al;

# Now set the display names to name_#, like roa1_human_1,
# roa1_human_2 etc. This **doesn't** change the underlying names
# of the sequences, you'll be glad to hear.
$al->set_displayname_count();

# Dump again, this time with the numbered display names.
print {$out2} $al;

# Get into the alignment and get things out: we just want to see
# how many sequences there are for each unique name.
my %count_of;

# Loop over the alignment, incrementing the counter for each
# sequence's id. each_seq() replaces the deprecated eachSeq().
foreach my $seq ( $al->each_seq() ) {
    $count_of{ $seq->id() }++;
}

# Disgorge the hash; keys are sorted so the report order is stable.
foreach my $id ( sort keys %count_of ) {
    print "$id has $count_of{$id} subsequences in this alignment\n";
}
92 __END__
93 GR10_BRANA/8-79 CFVGGL......AWATGDAELERTFS.....Q.FGEV..IDSKIIND.............RETGRSRGFGFVTFKDEKSMKDAIDEMNG.K...ELDGRTITV
94 HUD_HUMAN/48-119 LIVNYL......PQNMTQEEFRSLFG.....S.IGEI..ESCKLVRD.............KITGQSLGYGFVNYIDPKDAEKAINTLNG.L...RLQTKTIKV
95 IF32_SCHPO/41-124 VVIEGAP....VVEEAKQQDFFRFLSSKVLAK.IGKVKENGFYMPFE.........EKNGK..KMSLGLVFADFENVDGADLCVQELDGKQ...ILKNHTFVV
96 IF32_YEAST/79-157 IVVNGAPVIPSAKVPVLKKALTSLFS.....K.AGKV..VNMEFPID.............EATGKTKGFLFVECGSMNDAKKIIKSFHGKR...LDLKHRLFL
97 IF4B_HUMAN/98-168 AFLGNL......PYDVTEESIKEFFR.....G.LNIS...AVRLPR............EPSNPERLKGFGYAEFEDLDSLLSALS.LNE.E...SLGNRRIRV
98 LA_DROME/151-225 AYAKGF......PLDSQISELLDFTA.....N.YDKV..VNLTMRNS.........YDKPTKSYKFKGSIFLTFETKDQAKAFLE.QEK.I...VYKERELLR
99 LA_HUMAN/113-182 VYIKGF......PTDATLDDIKEWLE.....D.KGQV..LNIQMRR..............TLHKAFKGSIFVVFDSIESAKKFVE.TPG.Q...KYKETDLLI
100 MEI2_SCHPO/197-265 LFVTNL......PRIVPYATLLELFS.....K.LGDV..KGIDTSSL.................STDGICIVAFFDIRQAIQAAKSLRSQR...FFNDRLLYF
101 MODU_DROME/177-246 VFVTNL......PNEYLHKDLVALFA.....K.FGRL..SALQRFTN................LNGNKSVLIAFDTSTGAEAVLQAKPKAL...TLGDNVLSV
102 MODU_DROME/260-326 VVVGLI......GPNITKDDLKTFFE.....K.VAPV..EAVTISSN.................RLMPRAFVRLASVDDIPKALK.LHS.T...ELFSRFITV
103 MODU_DROME/342-410 LVVENVG....KHESYSSDALEKIFK.....K.FGDV..EEIDVVC..................SKAVLAFVTFKQSDAATKALAQLDG.K...TVNKFEWKL
104 MODU_DROME/422-484 ILVTNL......TSDATEADLRKVFN.....D.SGEI..ESIIMLG.....................QKAVVKFKDDEGFCKSFL.ANE.S...IVNNAPIFI
105 MSSP_HUMAN/31-102 LYIRGL......PPHTTDQDLVKLCQ.....P.YGKI..VSTKAILD.............KTTNKCKGYGFVDFDSPAAAQKAVSALKA.S...GVQAQKAKQ
106 NAM8_YEAST/165-237 IFVGDL......APNVTESQLFELFI.....NRYAST..SHAKIVHD.............QVTGMSKGYGFVKFTNSDEQQLALSEMQG.V...FLNGRAIKV
107 NONA_DROME/304-369 LYVGNL......TNDITDDELREMFK.....P.YGEI..SEIFSNLD...................KNFTFLKVDYHPNAEKAKRALDG.S...MRKGRQLRV
108 NONA_DROME/378-448 LRVSNL......TPFVSNELLYKSFE.....I.FGPI..ERASITVD..............DRGKHMGEGIVEFAKKSSASACLRMCNE.K...CFFLTASLR
109 NOP3_YEAST/127-190 LFVRPF......PLDVQESELNEIFG.....P.FGPM..KEVKILN.....................GFAFVEFEEAESAAKAIEEVHG.K...SFANQPLEV
110 NOP3_YEAST/202-270 ITMKNL......PEGCSWQDLKDLAR.....E.NSLE..TTFSSVN................TRDFDGTGALEFPSEEILVEALERLNN.I...EFRGSVITV
111 NOP4_YEAST/28-98 LFVRSI......PQDVTDEQLADFFS.....N.FAPI..KHAVVVKD..............TNKRSRGFGFVSFAVEDDTKEALAKARK.T...KFNGHILRV
112 NOP4_YEAST/292-363 VFVRNV......PYDATEESLAPHFS.....K.FGSV..KYALPVID.............KSTGLAKGTAFVAFKDQYTYNECIKNAPA.A...GSTSLLIGD
113 NSR1_YEAST/170-241 IFVGRL......SWSIDDEWLKKEFE.....H.IGGV..IGARVIYE.............RGTDRSRGYGYVDFENKSYAEKAIQEMQG.K...EIDGRPINC
114 NSR1_YEAST/269-340 LFLGNL......SFNADRDAIFELFA.....K.HGEV..VSVRIPTH.............PETEQPKGFGYVQFSNMEDAKKALDALQG.E...YIDNRPVRL
115 NUCL_CHICK/283-352 LFVKNL......TPTKDYEELRTAIK.....EFFGKK...NLQVSEV..............RIGSSKRFGYVDFLSAEDMDKALQ.LNG.K...KLMGLEIKL
116 PABP_DROME/4-75 LYVGDL......PQDVNESGLFDKFS.....S.AGPV..LSIRVCRD.............VITRRSLGYAYVNFQQPADAERALDTMNF.D...LVRNKPIRI
117 PABP_DROME/92-162 VFIKNL......DRAIDNKAIYDTFS.....A.FGNI..LSCKVATD..............EKGNSKGYGFVHFETEEAANTSIDKVNG.M...LLNGKKVYV
118 PABP_DROME/183-254 VYVKNF......TEDFDDEKLKEFFE.....P.YGKI..TSYKVMS..............KEDGKSKGFGFVAFETTEAAEAAVQALNGKD...MGEGKSLYV
119 PABP_SCHPO/249-319 VYIKNL......DTEITEQEFSDLFG.....Q.FGEI..TSLSLVKD..............QNDKPRGFGFVNYANHECAQKAVDELND.K...EYKGKKLYV
120 PES4_YEAST/93-164 LFIGDL......HETVTEETLKGIFK.....K.YPSF..VSAKVCLD.............SVTKKSLGHGYLNFEDKEEAEKAMEELNY.T...KVNGKEIRI
121 PES4_YEAST/305-374 IFIKNL......PTITTRDDILNFFS.....E.VGPI..KSIYLSN...............ATKVKYLWAFVTYKNSSDSEKAIKRYNN.F...YFRGKKLLV
122 PR24_YEAST/43-111 VLVKNL......PKSYNQNKVYKYFK.....H.CGPI..IHVDVAD...............SLKKNFRFARIEFARYDGALAAIT.KTH.K...VVGQNEIIV
123 PR24_YEAST/119-190 LWMTNF......PPSYTQRNIRDLLQ.....D.INVV.ALSIRLPSL..............RFNTSRRFAYIDVTSKEDARYCVEKLNG.L...KIEGYTLVT
124 PR24_YEAST/212-284 IMIRNL.....STELLDENLLRESFE.....G.FGSI..EKINIPAG............QKEHSFNNCCAFMVFENKDSAERALQ.MNR.S...LLGNREISV
125 PSF_HUMAN/373-443 LSVRNL......SPYVSNELLEEAFS.....Q.FGPI..ERAVVIVD..............DRGRSTGKGIVEFASKPAARKAFERCSE.G...VFLLTTTPR
126 PTB_HUMAN/61-128 IHIRKL......PIDVTEGEVISLGL.....P.FGKV..TNLLMLKG...................KNQAFIEMNTEEAANTMVN.YYT.SVTPVLRGQPIYI
127 PTB_HUMAN/186-253 IIVENL......FYPVTLDVLHQIFS.....K.FGTV....LKIIT...............FTKNNQFQALLQYADPVSAQHAKLSLDG.Q...NIYNACCTL
128 PUB1_YEAST/76-146 LYVGNL......DKAITEDILKQYFQ.....V.GGPI..ANIKIMID..............KNNKNVNYAFVEYHQSHDANIALQTLNG.K...QIENNIVKI
129 PUB1_YEAST/163-234 LFVGDL......NVNVDDETLRNAFK.....D.FPSY..LSGHVMWD.............MQTGSSRGYGFVSFTSQDDAQNAMDSMQG.Q...DLNGRPLRI
130 PUB1_YEAST/342-407 AYIGNI......PHFATEADLIPLFQ.....N.FGFI..LDFKHYPE...................KGCCFIKYDTHEQAAVCIVALAN.F...PFQGRNLRT
131 RB97_DROME/34-104 LFIGGL......APYTTEENLKLFYG.....Q.WGKV..VDVVVMRD.............AATKRSRGFGFITYTKSLMVDRAQE..NRPH...IIDGKTVEA
132 RN12_YEAST/200-267 IVIKFQ......GPALTEEEIYSLFR.....R.YGTI....IDIFP...............PTAANNNVAKVRYRSFRGAISAKNCVSG.I...EIHNTVLHI
133 RN15_YEAST/20-91 VYLGSI......PYDQTEEQILDLCS.....N.VGPV..INLKMMFD.............PQTGRSKGYAFIEFRDLESSASAVRNLNG.Y...QLGSRFLKC
134 RNP1_YEAST/37-109 LYVGNL......PKNCRKQDLRDLFE.....PNYGKI..TINMLKKK.............PLKKPLKRFAFIEFQEGVNLKKVKEKMNG.K...IFMNEKIVI
135 RO28_NICSY/99-170 LFVGNL......PYDIDSEGLAQLFQ.....Q.AGVV..EIAEVIYN.............RETDRSRGFGFVTMSTVEEADKAVELYSQ.Y...DLNGRLLTV
136 RO33_NICSY/116-187 LYVGNL......PFSMTSSQLSEIFA.....E.AGTV..ANVEIVYD.............RVTDRSRGFAFVTMGSVEEAKEAIRLFDG.S...QVGGRTVKV
137 RO33_NICSY/219-290 LYVANL......SWALTSQGLRDAFA.....D.QPGF..MSAKVIYD.............RSSGRSRGFGFITFSSAEAMNSALDTMNE.V...ELEGRPLRL
138 ROA1_BOVIN/106-176 IFVGGI......KEDTEEHHLRDYFE.....Q.YGKI..EVIEIMTD.............RGSGKKRGFAFVTFDDHDSVDKIVI.QKY.H...TVNGHNCEV
139 ROC_HUMAN/18-82 VFIGNL.....NTLVVKKSDVEAIFS.....K.YGKI..VGCSVHK.....................GFAFVQYVNERNARAAVAGEDG.R...MIAGQVLDI
140 ROF_HUMAN/113-183 VRLRGL......PFGCTKEEIVQFFS.....G.LEIV.PNGITLPVD..............PEGKITGEAFVQFASQELAEKALG.KHK.E...RIGHRYIEV
141 ROG_HUMAN/10-81 LFIGGL......NTETNEKALEAVFG.....K.YGRI..VEVLLMKD.............RETNKSRGFAFVTFESPADAKDAARDMNG.K...SLDGKAIKV
142 RT19_ARATH/33-104 LYIGGL......SPGTDEHSLKDAFS.....S.FNGV..TEARVMTN.............KVTGRSRGYGFVNFISEDSANSAISAMNG.Q...ELNGFNISV
143 RU17_DROME/104-175 LFIARI......NYDTSESKLRREFE.....F.YGPI..KKIVLIHD.............QESGKPKGYAFIEYEHERDMHAAYKHADG.K...KIDSKRVLV
144 RU1A_HUMAN/12-84 IYINNLNE..KIKKDELKKSLYAIFS.....Q.FGQI..LDILVSR................SLKMRGQAFVIFKEVSSATNALRSMQG.F...PFYDKPMRI
145 RU1A_HUMAN/210-276 LFLTNL......PEETNELMLSMLFN.....Q.FPGF..KEVRLVPG..................RHDIAFVEFDNEVQAGAARDALQG.F...KITQNNAMK
146 RU1A_YEAST/229-293 LLIQNL......PSGTTEQLLSQILG.....N.EALV...EIRLVSV...................RNLAFVEYETVADATKIKNQLGS.T...YKLQNNDVT
147 RU2B_HUMAN/9-81 IYINNMND..KIKKEELKRSLYALFS.....Q.FGHV..VDIVALK................TMKMRGQAFVIFKELGSSTNALRQLQG.F...PFYGKPMRI
148 RU2B_HUMAN/153-220 LFLNNL......PEETNEMMLSMLFN.....Q.FPGF..KEVRLVPG..................RHDIAFVEFENDGQAGAARDALQGFK...ITPSHAMKI
149 SC35_CHICK/16-87 LKVDNL......TYRTSPDTLRRVFE.....K.YGRV..GDVYIPRD.............RYTKESRGFAFVRFHDKRDAEDAMDAMDG.A...VLDGRELRV
150 SP33_HUMAN/17-85 IYVGNL......PPDIRTKDIEDVFY.....K.YGAI..RDIDLKNR................RGGPPFAFVEFEDPRDAEDAVYGRDG.Y...DYDGYRLRV
151 SP33_HUMAN/122-186 VVVSGL......PPSGSWQDLKDHMR.....E.AGDV..CYADVYRD....................GTGVVEFVRKEDMTYAVRKLDN.T...KFRSHEGET
152 SQD_DROME/58-128 LFVGGL......SWETTEKELRDHFG.....K.YGEI..ESINVKTD.............PQTGRSRGFAFIVFTNTEAIDKVSA.ADE.H...IINSKKVDP
153 SQD_DROME/138-208 IFVGGL......TTEISDEEIKTYFG.....Q.FGNI..VEVEMPLD.............KQKSQRKGFCFITFDSEQVVTDLLK.TPK.Q...KIAGKEVDV
154 SR55_DROME/5-68 VYVGGL......PYGVRERDLERFFK.....G.YGRT..RDILIKN.....................GYGFVEFEDYRDADDAVYELNG.K...ELLGERVVV
155 SSB1_YEAST/39-114 IFIGNV......AHECTEDDLKQLFV.....EEFGDE..VSVEIPIK..........EHTDGHIPASKHALVKFPTKIDFDNIKENYDT.K...VVKDREIHI
156 SXLF_DROME/127-198 LIVNYL......PQDMTDRELYALFR.....A.IGPI..NTCRIMRD.............YKTGYSFGYAFVDFTSEMDSQRAIKVLNG.I...TVRNKRLKV
157 SXLF_DROME/213-285 LYVTNL......PRTITDDQLDTIFG.....K.YGSI..VQKNILRD.............KLTGRPRGVAFVRYNKREEAQEAISALNNVI...PEGGSQPLS
158 TIA1_HUMAN/9-78 LYVGNL......SRDVTEALILQLFS.....Q.IGPC..KNCKMIMD...............TAGNDPYCFVEFHEHRHAAAALAAMNG.R...KIMGKEVKV
159 TIA1_HUMAN/97-168 VFVGDL......SPQITTEDIKAAFA.....P.FGRI..SDARVVKD.............MATGKSKGYGFVSFFNKWDAENAIQQMGG.Q...WLGGRQIRT
160 TIA1_HUMAN/205-270 VYCGGV......TSGLTEQLMRQTFS.....P.FGQI..MEIRVFPD...................KGYSFVRFNSHESAAHAIVSVNG.T...TIEGHVVKC
161 TRA2_DROME/99-170 IGVFGL......NTNTSQHKVRELFN.....K.YGPI..ERIQMVID.............AQTQRSRGFCFIYFEKLSDARAAKDSCSG.I...EVDGRRIRV
162 U2AF_HUMAN/261-332 LFIGGL......PNYLNDDQVKELLT.....S.FGPL..KAFNLVKD.............SATGLSKGYAFCEYVDINVTDQAIAGLNG.M...QLGDKKLLV
163 U2AF_SCHPO/312-383 IYISNL......PLNLGEDQVVELLK.....P.FGDL..LSFQLIKN.............IADGSSKGFCFCEFKNPSDAEVAISGLDG.K...DTYGNKLHA
164 U2AG_HUMAN/67-142 CAVSDVEM..QEHYDEFFEEVFTEME.....EKYGEV..EEMNVCDN..............LGDHLVGNVYVKFRREEDAEKAVIDLNN.R...WFNGQPIHA
165 WHI3_YEAST/540-614 LYVGNL......PSDATEQELRQLFS.....G.QEGF..RRLSFRNK..........NTTSNGHSHGPMCFVEFDDVSFATRALAELYG.R...QLPRSTVSS
166 X16_HUMAN/12-78 VYVGNL......GNNGNKTELERAFG.....Y.YGPL..RSVWVARN..................PPGFAFVEFEDPRDAADAVRELDG.R...TLCGCRVRV
167 YHC4_YEAST/348-415 IFVGQL......DKETTREELNRRFS.....T.HGKI..QDINLIFK.................PTNIFAFIKYETEEAAAAALESENH.A...IFLNKTMHV
168 YHH5_YEAST/315-384 ILVKNL......PSDTTQEEVLDYFS.....T.IGPI..KSVFISEK...............QANTPHKAFVTYKNEEESKKAQKCLNK.T...IFKNHTIWV
169 YIS1_YEAST/66-136 IFVGNI......TPDVTPEQIEDHFK.....D.CGQI..KRITLLYD.............RNTGTPKGYGYIEFESPAYREKALQ.LNG.G...ELKGKKIAV
170 YIS5_YEAST/33-104 IYIGNL......NRELTEGDILTVFS.....E.YGVP..VDVILSRD.............ENTGESQGFAYLKYEDQRSTILAVDNLNG.F...KIGGRALKI
171 ARP2_PLAFA/364-438 VEVTYLF....STYLVNGQTL..IYS.....N.ISVV....LVILY........HQKFKETVLGRNSGFGFVSYDNVISAQHAIQFMNG.Y...FVNNKYLKV
172 CABA_MOUSE/77-147 MFVGGL......SWDTSKKDLKDYFT.....K.FGEV..VDCTIKMD.............PNTGRSRGFGFILFKDSSSVEKVLD.QKE.H...RLDGRVIDP
173 CABA_MOUSE/161-231 IFVGGL......NPEATEEKIREYFG.....Q.FGEI..EAIELPID.............PKLNKRRGFVFITFKEEDPVKKVLE.KKF.H...TVSGSKCEI
174 CPO_DROME/453-526 LFVSGL......PMDAKPRELYLLFR.....A.YEGY..EGSLLKV............TSKNGKTASPVGFVTFHTRAGAEAAKQDLQGVR...FDPDMPQTI
175 CST2_HUMAN/18-89 VFVGNI......PYEATEEQLKDIFS.....E.VGPV..VSFRLVYD.............RETGKPKGYGFCEYQDQETALSAMRNLNG.R...EFSGRALRV
176 D111_ARATH/281-360 LLLRNMVG.PGQVDDELEDEVGGECA.....K.YGTV..TRVLIFE..........ITEPNFPVHEAVRIFVQFSRPEETTKALVDLDG.R...YFGGRTVRA
177 ELAV_DROME/250-322 LYVSGL......PKTMTQQELEAIFA.....P.FGAI..ITSRILQN............AGNDTQTKGVGFIRFDKREEATRAIIALNG.T...TPSSCTDPI
178 ELAV_DROME/404-475 IFIYNL......APETEEAALWQLFG.....P.FGAV..QSVKIVKD.............PTTNQCKGYGFVSMTNYDEAAMAIRALNG.Y...TMGNRVLQV
179 EWS_HUMAN/363-442 IYVQGL......NDSVTLDDLADFFK.....Q.CGVV..K.MNKRTG....QPMIHIYLDKETGKPKGDATVSYEDPPTAKAAVEWFDG.K...DFQGSKLKV
180 GBP2_YEAST/124-193 IFVRNL......TFDCTPEDLKELFG.....T.VGEV..VEADIIT...............SKGHHRGMGTVEFTKNESVQDAISKFDG.A...LFMDRKLMV
181 GBP2_YEAST/221-291 VFIINL......PYSMNWQSLKDMFK.....E.CGHV..LRADVELD..............FNGFSRGFGSVIYPTEDEMIRAIDTFNG.M...EVEGRVLEV
182 GBP2_YEAST/351-421 IYCSNL......PFSTARSDLFDLFG.....P.IGKI..NNAELKP..............QENGQPTGVAVVEYENLVDADFCIQKLNN.Y...NYGGCSLQI