Merge branch 'master' into sim-target-tree (r25924)
[kugel-rb.git] / tools / wn2rdf.pl
blob2fff87d66be1b5d66deed4a525bebebfdc1eded1
1 #! /usr/bin/perl -w
3 # Wordnet dictionary database converter
5 # Converts the Wordnet prolog data to rockbox dictionary format.
7 # Written by Miika Pekkarinen <slasher@ihme.org>
9 # $Id$
11 use strict;
13 # Lookup tables
14 my %words;
15 my %descriptions;
# Translate a numeric category id into the one-letter tag written to the
# dictionary.  The ids are the ones used when the output is assembled:
# 1 = noun, 2 = verb, 3 = adjective, 4 = adverb.
#
#   $id - numeric category id
#
# Returns 'N' for 1, 'V' for 2, 'A' for both 3 and 4 (adjectives and
# adverbs share a tag), and '?' for any other value.
sub getcatname {
    my ($id) = @_;

    return $id == 1               ? 'N'
         : $id == 2               ? 'V'
         : ($id == 3 || $id == 4) ? 'A'
         :                          '?';
}
# Open the two WordNet prolog inputs and the output file up front so that a
# missing or unwritable file aborts the run before any work is done.
# The three-argument form of open keeps the mode separate from the file
# name.  The bareword handle names are kept because the rest of the script
# reads from / writes to IN_WORD, IN_DESC and OUTPUT.
open IN_WORD, '<', 'wn_s.pl'        or die "Open fail(#1): $!";
open IN_DESC, '<', 'wn_g.pl'        or die "Open fail(#2): $!";
open OUTPUT,  '>', 'dict.preparsed' or die "Open fail(#3): $!";
# Parse one sense fact from the word file.
#
#   $line - a chomped input line, e.g. s(100001740,1,'entity',n,1,11).
#
# Returns the list (synset id, word) with the prolog quoting removed, or an
# empty list when the line is not a well-formed s(...) fact.
sub _parse_word_fact {
    my ($line) = @_;

    # The word is the quoted third field.  Matching it as a quoted string
    # (instead of splitting on commas) keeps words that contain a comma
    # intact; the remaining fields are not needed and are only skipped.
    my ($seqid, $word) =
        $line =~ m/^s\(([^,]+),[^,]*,'(.*)'(?:,[^',]*)*\)\.\s*$/
        or return;

    # A quote inside the word is written doubled; undo every occurrence,
    # not just the first one.
    $word =~ s/''/'/g;

    return ($seqid, $word);
}

print "Reading word file...\n";

# Read everything into memory:
#   $words{lowercased word}{synset id} = first digit of the synset id
while (my $line = <IN_WORD>) {
    chomp $line;
    next if $line =~ /^\s*$/;

    my ($seqid, $word) = _parse_word_fact($line);
    unless (defined $seqid) {
        # Report the bad line here rather than letting it surface later
        # as an unrelated "Failed to link word" error.
        warn "Skipping malformed line $. of word file: $line\n";
        next;
    }

    # The leading digit of the synset id is used as the category id.
    my $category = substr $seqid, 0, 1;

    $words{lc $word}{$seqid} = $category;
}

close IN_WORD;
# Parse one gloss fact from the description file.
#
#   $line - a chomped input line, e.g.
#           g(100002056,'(a separate and self-contained entity)').
#
# Returns the list (synset id, description) with the prolog quoting removed,
# or an empty list when the line is not a well-formed g(...) fact.
sub _parse_desc_fact {
    my ($line) = @_;

    # Everything after the first comma is the quoted gloss; it may itself
    # contain commas.
    my ($seqid, $desc) = $line =~ m/^g\(([^,]+),(.*)\)\.\s*$/
        or return;

    # '(gloss)' => gloss.  As a fallback a gloss that is quoted but not
    # wrapped in parentheses only loses its quotes.
    # NOTE(review): the unparenthesised form is assumed to occur in some
    # WordNet releases - confirm against the data actually used.
    $desc =~ s/^'\((.*)\)'$/$1/ or $desc =~ s/^'(.*)'$/$1/;

    # A quote inside the gloss is written doubled; undo every occurrence,
    # not just the first one.
    $desc =~ s/''/'/g;

    return ($seqid, $desc);
}

print "Reading description file...\n";

# Build $descriptions{synset id} = gloss text
while (my $line = <IN_DESC>) {
    chomp $line;
    next if $line =~ /^\s*$/;

    my ($seqid, $desc) = _parse_desc_fact($line);
    unless (defined $seqid) {
        warn "Skipping malformed line $. of description file: $line\n";
        next;
    }

    $descriptions{$seqid} = $desc;
}

close IN_DESC;
print "Sorting and writing output...\n";

# Now sort and find correct descriptions.  One output line is written per
# word:  word <TAB> section [<TAB> section ...]
# where each section is "<category letter>: 1. gloss 2. gloss ...".
foreach my $word (sort keys %words) {
    my %categories;

    # Find all definitions of the word.  The synset ids are visited in
    # sorted order so the numbering of the definitions is the same on every
    # run instead of following the hash's internal order.
    foreach my $id (sort keys %{ $words{$word} }) {
        my $catid       = $words{$word}{$id};
        my $description = $descriptions{$id};

        if (!defined($description) or $description eq '') {
            # Name the synset id that has no gloss (the id, not the hash
            # of ids) so the broken entry can be found in the input.
            print STDERR "Error: Failed to link word: $word / $id\n";
            exit 1;
        }

        push @{ $categories{$catid} }, $description;
    }

    # 1 = noun
    # 2 = verb
    # 3 = adjective
    # 4 = adverb
    # Senses whose category id is outside 1..4 are not written.
    my @sections;
    for my $catid (1 .. 4) {
        my $defs = $categories{$catid} or next;

        # Number the definitions within each category starting from 1.
        my $n       = 0;
        my $catdesc = join ' ', map { ++$n . ". $_" } @$defs;

        push @sections, getcatname($catid) . ": $catdesc";
    }

    # Every word must end up with at least one section.
    die "Internal error" unless @sections;

    print OUTPUT "$word\t", join("\t", @sections), "\n"
        or die "Write fail: $!";
}

# Buffered write errors (e.g. a full disk) only show up at close time, so
# check it before claiming success.
close OUTPUT or die "Close fail: $!";

print "Done, output was successfully written!\n";