summaryrefslogtreecommitdiff
path: root/contrib/less/mkutable
blob: 51dbab71d16136050aec6fbd2a186d4e54c6fbdd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env perl
use strict;

my $USAGE = <<__EOF__;
   usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt
          -n = take non-matching types
          -f = zero-based type field (default 2)
__EOF__

use Getopt::Std;
use vars qw( $opt_f $opt_n );

my $type_field = 2;

# Override Unicode tables for certain control chars
# that are expected to be found in normal text files.
my %force_space = (
    0x08 => 1, # backspace
    0x09 => 1, # tab
    0x0a => 1, # newline
    0x0c => 1, # form feed
    0x0d => 1, # carriage return
);

# Override Unicode tables for certain modifier chars which act differently
# on different terminals. Treat them as omittable.
my @force_omit = (
    [0xad,    0xad],    # SOFT HYPHEN
    [0x200d,  0x200d],  # ZERO WIDTH JOINER
    [0x1f3fb, 0x1f3ff], # EMOJI MODIFIER FITZPATRICK TYPE-[1-6]
    [0x1f9b0, 0x1f9b3], # EMOJI COMPONENT [RED,CURLY,BALD,WHITE] HAIR
    [0xfe00,  0xfe0f],  # VARIATION SELECTOR-[1-16]
    [0xe0100, 0xe01ef], # VARIATION SELECTOR-[17-256]
);

# Hangul Jamo medial vowels and final consonants should be zero width.
my @force_compose = (
    [0x1160, 0x11ff],
    [0xd7b0, 0xd7c6],
    [0xd7cb, 0xd7fb]
);

exit (main() ? 0 : 1);

sub main {
    my $args = join ' ', @ARGV;
    die $USAGE if not getopts('f:n');
    $type_field = $opt_f if $opt_f;

    my %types;
    my $arg;
    while ($arg = shift @ARGV) {
        last if $arg eq '--';
        $types{$arg} = 1;
    }
    my %out = ( 'types' => \%types );

    my %force_compose;
    foreach my $comp (@force_compose) {
        my ($lo,$hi) = @$comp;
        for (my $ch = $lo; $ch <= $hi; ++$ch) {
            $force_compose{$ch} = 1;
        }
    }
    my %force_omit;
    foreach my $comp (@force_omit) {
        my ($lo,$hi) = @$comp;
        for (my $ch = $lo; $ch <= $hi; ++$ch) {
            $force_omit{$ch} = 1;
        }
    }

    my ($sec,$min,$hour,$mday,$mon,$year) = gmtime($ENV{SOURCE_DATE_EPOCH} // time());
    my @month = ( "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" );
    printf "/* Generated by \"%s %s\" on %s %2d %2d:%02d:%02d GMT %d */\n",
        $0, $args, $month[$mon], $mday, $hour, $min, $sec, $year+1900;

    my $last_code = 0;
    my $start_range = 0;
    while (<>) {
        chomp;
        s/#.*//;
        my @fields = split /;/;
        next if not @fields;
        my ($lo_code, $hi_code);
        my $codes = $fields[0];
        if ($codes =~ /(\w+)\.\.(\w+)/) {
            $lo_code = hex $1;
            $hi_code = hex $2;
        } else {
            $lo_code = $hi_code = hex $codes;
        }
        if ($fields[1] =~ /, First>$/) {
            die "invalid Unicode data: First with range" if $hi_code != $lo_code;
            $start_range = $lo_code;
            next;
        }
        if ($fields[1] =~ /, Last>$/) {
            die "invalid Unicode data: Last without First" if not $start_range;
            $lo_code = $start_range;
            $start_range = 0;
        } elsif ($start_range) {
            die "invalid Unicode data: First without Last";
        }
        my $type = $fields[$type_field];
        $type =~ s/\s//g;
        for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) {
            output(\%out, $last_code,
                $force_space{$last_code} ? 'Zs' : $force_compose{$last_code} ? 'Mn' :
                $force_omit{$last_code} ? 'Xx' : $type);
        }
    }
    output(\%out, $last_code);
    return 1;
}

sub output {
    my ($out, $code, $type) = @_;
    my $type_ok = ($type and ${${$out}{types}}{$type});
    $type_ok = not $type_ok if $opt_n;
    my $prev_code = $$out{prev_code};

    if (not $type_ok) {
        end_run($out, $prev_code);
    } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) {
        end_run($out, $prev_code);
        start_run($out, $code, $type);
    }
    $$out{prev_code} = $code;
}

sub start_run {
    my ($out, $code, $type) = @_;
    $$out{start_code} = $code;
    $$out{prev_code} = $code;
    $$out{run_type} = $type;
    $$out{in_run} = 1;
}

sub end_run {
    my ($out, $code) = @_;
    return if not $$out{in_run};
    printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type};
    $$out{in_run} = 0;
}