Development

/tools/i18n-icu2dat/icu-converter.php

You must first sign up to be able to contribute.

root/tools/i18n-icu2dat/icu-converter.php

Revision 28046, 7.6 kB (checked in by FabianLange, 5 years ago)

prepared icu converter for icu 4.3 release preparing to fix some bugs (refs #7988, #8000)

Line 
1 <?php
2 /**
3  * Converts the ICU files into Prado style format:
4  * ICU: http://source.icu-project.org/repos/icu/icu/tags/milestone-4-3-4/source/data/
5  * Prado: http://code.google.com/p/prado3/source/browse/#svn/trunk/framework/I18N/core/data
6  *
7  * Created for http://www.symfony-project.org by Fabian Lange (Fabian.Lange@symfony-project.com)
8  *
9  * The generated files are directly compatible with symfony 1.3+
10  * The script can be changed to generate more backwards compatible files if desired.
11  */
12 /**
13  * How to use:
14  *  - put this file and sfYaml classes in a directory
15  *  - use spec version 1.2 for sfYaml to parse 'no' correctly as Norwegian (not as false) (default for up-to-date sfYaml)
16  *  - download ICU files in subdirectory called "data"
17  *  - preprocess some ICU files.
18  */
19 /**
20  * Preprocess (take care not to kill utf-8):
21  * due to line wrappings in icu files:
22  * region/es.txt line 141 HK{"Región Administrativa Especial de Hong Kong de la República Popular China"}
23  * locales/he.txt line 1788 fix date time patterns
24  * region/it.txt line 140 HK{"Regione Amministrativa Speciale di Hong Kong della Repubblica Popolare Cinese"}
25  * region/it.txt line 193 MO{"Regione Amministrativa Speciale di Macao della Repubblica Popolare Cinese"}
26  *
27  * Note: feel free to supply a patch that eliminates the need of preprocessing
28  */
29
30 // Include the symfony YAML library (can be obtained from symfony 1.3 +)
31 include_once('sfYaml.php');
32
/**
 * Prepares raw file content for the conversion steps.
 *
 * Drops a leading UTF-8 byte order mark (EF BB BF) when present and rewrites
 * CRLF and lone CR line endings to a single LF.
 *
 * @param string $string raw file content
 *
 * @return string content without BOM and with LF-only line endings
 */
function sanitize($string)
{
  $bom = "\xEF\xBB\xBF";
  if (strncmp($string, $bom, 3) === 0)
  {
    $string = substr($string, strlen($bom));
  }

  // CRLF is listed before the lone CR, otherwise it would end up as two LFs
  return str_replace(array("\r\n", "\r"), "\n", $string);
}
41
/**
 * Recursively removes empty arrays from a nested array.
 *
 * Some postprocessing might result in empty arrays; they are cleaned before
 * serializing. Nested arrays are cleaned first, so an array that contained
 * nothing but empty arrays is removed as well. Non-array values (including
 * '', 0 and null) are kept untouched and keys are preserved.
 *
 * @param array $input the (possibly nested) array to clean
 *
 * @return array copy of $input without empty arrays at any depth
 */
function remove_emtpy_arrays($input)
{
  foreach ($input as $key => $value)
  {
    if (!is_array($value))
    {
      continue;
    }

    // clean the children first; the cleaned copy has to be written back to
    // $input, otherwise the recursion has no effect on the returned array
    $value = remove_emtpy_arrays($value);
    if (empty($value))
    {
      unset($input[$key]);
    }
    else
    {
      $input[$key] = $value;
    }
  }

  return $input;
}
61
// dirty hack to avoid special handling of the metazone input data:
// the file is copied into the locales directory so it runs through the same conversion
copy('data/misc/metazoneInfo.txt', 'data/locales/metazoneInfo.txt');

// since newer icu releases the data is split in multiple files
$types = array ("locales", "curr", "zone", "lang", "region");

foreach ($types as $type)
{
  $files = glob("data/".$type."/*.txt");
  foreach ($files as $filename)
  {
    // the locale is the file name without directory and extension
    $locale = basename($filename, '.txt');

    // Step 1: convert ICU txt into yml:
    $icu_data = file_get_contents($filename);
    if ($icu_data === false)
    {
      // unreadable input: skip it instead of writing an empty yml for this locale
      continue;
    }
    $icu_data = sanitize($icu_data);
    // skip Copyright - file starting with $locale{
    // (the locale is quoted so it is always matched literally inside the pattern)
    $icu_data = preg_replace('/\/\/.*?('.preg_quote($locale, '/').'\{)/sm', '$1', $icu_data);

    // Remove Duplicated from xxx package
    $icu_data = preg_replace('/\{\s*\/\*\*[^\}]*\}/sm', '', $icu_data);

    // this should reference the current locale
    $icu_data = str_replace('/LOCALE', $locale, $icu_data);
    // done for BC with old prado files
    $icu_data = str_replace('%%ALIAS', '__ALIAS', $icu_data);

    // original prado neither uses this, nor imports this correctly. this is a typemarker. php manages this on its own
    // this enables sensible use of the field if required in future
    $icu_data = str_replace(':intvector{', '{ ', $icu_data);
    $icu_data = str_replace(':int{', '{ ', $icu_data);

    // hack: the {0} and {1} placeholders need to be protected from the later array conversions
    $icu_data = str_replace('{0}', '<0>', $icu_data);
    $icu_data = str_replace('{1}', '<1>', $icu_data);

    // only valid for metazone, but will remove need for manual preprocessing
    $icu_data = str_replace('metazoneInfo:table(nofallback)', 'metazoneInfo', $icu_data);

    // Step 2: make yml out of icu format
    $yml = $icu_data;

    // create array structure from csv
    //             "R$",
    // ->
    //            - "R$"
    $yml = preg_replace('/^(\s*)(.*),\s*$/m', '$1- $2', $yml);

    // create array structure for name elements
    //         PT{"Portugal"}
    // ->
    //         PT: ["Portugal"]
    $yml = str_replace('"}', '"]', $yml);
    $yml = str_replace('{"', ': ["', $yml);

    // create yml key-value pairs from { array notation
    $yml = str_replace('{', ':', $yml);
    $yml = str_replace('}', '', $yml);

    // some exemplar chars are multiline, we will remove them anyway later.
    // for parsing them lets pretend it is a string block
    $yml = str_replace('ExemplarCharacters:', 'ExemplarCharacters: |', $yml);

    // the original CultureInfo class simplified single element arrays into the element alone
    // now we do this already at data creation time and remove the simplify() calls
    // this greatly reduces file size and improves runtime performance
    //         PT: ["Portugal"]
    // ->
    //         PT: "Portugal"
    $yml = preg_replace('/\[("[^"]*")\]/', '$1', $yml);

    // restore the {0} and {1} placeholders that were protected in step 1
    $yml = str_replace('<0>', '{0}', $yml);
    $yml = str_replace('<1>', '{1}', $yml);

    // save for manual checks of generated yml
    file_put_contents('data/'.$type.'/'.$locale.'.yml', $yml);
  }
}
142
// we use the locales directory as input for locales, because it contains all files
// 'region' or other may contain less files
$files = glob("data/locales/*.yml");
foreach ($files as $filename)
{
  // the locale is the file name without directory and extension
  $locale = basename($filename, '.yml');

  // step 3: Load and Merge the YAML files and save serialized
  $dat_data = array();
  foreach ($types as $type)
  {
    $yml_file = 'data/'.$type.'/'.$locale.'.yml';
    if (!is_file($yml_file))
    {
      // not every type directory has a file for every locale
      continue;
    }

    $array = sfYaml::load($yml_file);
    if (!is_array($array) || !isset($array[$locale]) || !is_array($array[$locale]))
    {
      // no usable data below the locale key; array_merge() only accepts arrays,
      // so merging this would break the data collected so far
      continue;
    }

    $type_data = $array[$locale];
    if ($type == 'region' && isset($type_data['Countries']))
    {
      foreach ($type_data['Countries'] as $key => $country)
      {
        // numeric keys are regions, no countries (why are they in the data files?)
        if (is_numeric($key))
        {
          unset($type_data['Countries'][$key]);
        }
      }
      // ZZ is the unknown entry
      unset($type_data['Countries']['ZZ']);
    }
    if ($type == 'curr')
    {
      // XXX is unknown and XTS is testing
      unset($type_data['Currencies']['XTS']);
      unset($type_data['Currencies']['XXX']);
      unset($type_data['CurrencyPlurals']['XTS']);
      unset($type_data['CurrencyPlurals']['XXX']);
    }
    $dat_data = array_merge($dat_data, $type_data);
  }

  // those were not in prado and seem not to make any usable sense for us
  // we remove them to reduce file size
  unset($dat_data['codePatterns']);
  unset($dat_data['ExemplarCharacters']);
  unset($dat_data['AuxExemplarCharacters']);
  unset($dat_data['CurrencyUnitPatterns']);

  // possibly remove more, but the data is actually useful

  // clean any remaining empty arrays
  $dat_data = remove_emtpy_arrays($dat_data);

  // save for import into symfony
  file_put_contents('data/'.$locale.'.dat', serialize($dat_data));
}
196
// postprocess the metazoneInfo.dat into root.dat
$metazone_file = 'data/metazoneInfo.dat';
$root_file = 'data/root.dat';

$metazoneInfo = unserialize(file_get_contents($metazone_file));

// build the map written to root.dat under 'TimeZones':
// key is the mapping key with ':' replaced by '/', value is the first element of its last mapping
$zones = array();
foreach ($metazoneInfo['metazoneMappings'] as $zone_key => $mappings)
{
  // only take last valid timezone mapping
  $last_mapping = array_pop($mappings);
  $zone_name = strtr($zone_key, ':', '/');
  $zones[$zone_name] = $last_mapping[0];
}

// add to root file
$rootData = unserialize(file_get_contents($root_file));
$rootData['TimeZones'] = $zones;
file_put_contents($root_file, serialize($rootData));

// the intermediate metazone file is not needed once its data is in root.dat
unlink($metazone_file);
Note: See TracBrowser for help on using the browser.