// // updataout.cpp: version 1.400 (2021/10/10). // // This is a program that generates srell_updata.hpp from: // DerivedCoreProperties.txt // DerivedNormalizationProps.txt // emoji-data.txt // PropList.txt // ScriptExtensions.txt // Scripts.txt // UnicodeData.txt // provided by the Unicode Consortium. The latese versions of them // except emoji-data.txt are available at: http://www.unicode.org/Public/UNIDATA/ // emoji-data.txt is available at: http://www.unicode.org/Public/UNIDATA/emoji/ // #include #include #include #include #include #include #include #include "../srell.hpp" #if defined(_MSC_VER) && _MSC_VER >= 1400 #pragma warning(disable:4996) #endif namespace updata { static const char *const property_names[] = { // 3 "General_Category:gc", "Script:sc", "Script_Extensions:scx", "" }; static const char *const binary_property_names[] = { // 53 (52+1) // *1: http://unicode.org/reports/tr18/#General_Category_Property // *2: 9th field in UnicodeData.txt "ASCII", // *1 "ASCII_Hex_Digit:AHex", // PropList.txt "Alphabetic:Alpha", // DerivedCoreProperties.txt "Any", // *1 "Assigned", // *1 "Bidi_Control:Bidi_C", // PropList.txt "Bidi_Mirrored:Bidi_M", // *2 "Case_Ignorable:CI", // DerivedCoreProperties.txt "Cased", // DerivedCoreProperties.txt "Changes_When_Casefolded:CWCF", // DerivedCoreProperties.txt "Changes_When_Casemapped:CWCM", // DerivedCoreProperties.txt "Changes_When_Lowercased:CWL", // DerivedCoreProperties.txt "Changes_When_NFKC_Casefolded:CWKCF", // DerivedNormalizationProps.txt "Changes_When_Titlecased:CWT", // DerivedCoreProperties.txt "Changes_When_Uppercased:CWU", // DerivedCoreProperties.txt "Dash", // PropList.txt "Default_Ignorable_Code_Point:DI", // DerivedCoreProperties.txt "Deprecated:Dep", // PropList.txt "Diacritic:Dia", // PropList.txt "Emoji", // emoji-data.txt "Emoji_Component:EComp", // emoji-data.txt "Emoji_Modifier:EMod", // emoji-data.txt "Emoji_Modifier_Base:EBase", // emoji-data.txt "Emoji_Presentation:EPres", // emoji-data.txt "Extended_Pictographic:ExtPict", // emoji-data.txt "Extender:Ext", // PropList.txt "Grapheme_Base:Gr_Base", // DerivedCoreProperties.txt "Grapheme_Extend:Gr_Ext", // DerivedCoreProperties.txt "Hex_Digit:Hex", // PropList.txt "IDS_Binary_Operator:IDSB", // PropList.txt "IDS_Trinary_Operator:IDST", // PropList.txt "ID_Continue:IDC", // DerivedCoreProperties.txt "ID_Start:IDS", // DerivedCoreProperties.txt "Ideographic:Ideo", // PropList.txt "Join_Control:Join_C", // PropList.txt "Logical_Order_Exception:LOE", // PropList.txt "Lowercase:Lower", // DerivedCoreProperties.txt "Math", // DerivedCoreProperties.txt "Noncharacter_Code_Point:NChar", // PropList.txt "Pattern_Syntax:Pat_Syn", // PropList.txt "Pattern_White_Space:Pat_WS", // PropList.txt "Quotation_Mark:QMark", // PropList.txt "Radical", // PropList.txt "Regional_Indicator:RI", // PropList.txt "Sentence_Terminal:STerm", // PropList.txt "Soft_Dotted:SD", // PropList.txt "Terminal_Punctuation:Term", // PropList.txt "Unified_Ideograph:UIdeo", // PropList.txt "Uppercase:Upper", // DerivedCoreProperties.txt "Variation_Selector:VS", // PropList.txt "White_Space:space", // PropList.txt "XID_Continue:XIDC", // DerivedCoreProperties.txt "XID_Start:XIDS", // DerivedCoreProperties.txt // ECMAScript 2019/Unicode 11: // "Extended_Pictographic:ExtPict", // ECMAScript 2021/Unicode 13: // Aliases: EComp, EMod, EBase, EPres, and ExtPict "" }; static const char *const gc_values[] = { // 38 "Other:C", "Control:Cc:cntrl", "Format:Cf", "Unassigned:Cn", "Private_Use:Co", "Surrogate:Cs", "Letter:L", "Cased_Letter:LC", "Lowercase_Letter:Ll", "Titlecase_Letter:Lt", "Uppercase_Letter:Lu", "Modifier_Letter:Lm", "Other_Letter:Lo", "Mark:M:Combining_Mark", "Spacing_Mark:Mc", "Enclosing_Mark:Me", "Nonspacing_Mark:Mn", "Number:N", "Decimal_Number:Nd:digit", "Letter_Number:Nl", "Other_Number:No", "Punctuation:P:punct", "Connector_Punctuation:Pc", "Dash_Punctuation:Pd", "Close_Punctuation:Pe", "Final_Punctuation:Pf", "Initial_Punctuation:Pi", "Other_Punctuation:Po", "Open_Punctuation:Ps", "Symbol:S", "Currency_Symbol:Sc", "Modifier_Symbol:Sk", "Math_Symbol:Sm", "Other_Symbol:So", "Separator:Z", "Line_Separator:Zl", "Paragraph_Separator:Zp", "Space_Separator:Zs", "" }; static const char *const script_names[] = { // 161 (141+7+4+4+5) "Adlam:Adlm", "Ahom:Ahom", "Anatolian_Hieroglyphs:Hluw", "Arabic:Arab", "Armenian:Armn", "Avestan:Avst", "Balinese:Bali", "Bamum:Bamu", "Bassa_Vah:Bass", "Batak:Batk", "Bengali:Beng", "Bhaiksuki:Bhks", "Bopomofo:Bopo", "Brahmi:Brah", "Braille:Brai", "Buginese:Bugi", "Buhid:Buhd", "Canadian_Aboriginal:Cans", "Carian:Cari", "Caucasian_Albanian:Aghb", "Chakma:Cakm", "Cham:Cham", "Cherokee:Cher", "Chorasmian:Chrs", "Common:Zyyy", "Coptic:Copt:Qaac", "Cypro_Minoan:Cpmn", "Cuneiform:Xsux", "Cypriot:Cprt", "Cyrillic:Cyrl", "Deseret:Dsrt", "Devanagari:Deva", "Dives_Akuru:Diak", "Dogra:Dogr", "Duployan:Dupl", "Egyptian_Hieroglyphs:Egyp", "Elbasan:Elba", "Elymaic:Elym", "Ethiopic:Ethi", "Georgian:Geor", "Glagolitic:Glag", "Gothic:Goth", "Grantha:Gran", "Greek:Grek", "Gujarati:Gujr", "Gunjala_Gondi:Gong", "Gurmukhi:Guru", "Han:Hani", "Hangul:Hang", "Hanifi_Rohingya:Rohg", "Hanunoo:Hano", "Hatran:Hatr", "Hebrew:Hebr", "Hiragana:Hira", "Imperial_Aramaic:Armi", "Inherited:Zinh:Qaai", "Inscriptional_Pahlavi:Phli", "Inscriptional_Parthian:Prti", "Javanese:Java", "Kaithi:Kthi", "Kannada:Knda", "Katakana:Kana", "Kayah_Li:Kali", "Kharoshthi:Khar", "Khitan_Small_Script:Kits", "Khmer:Khmr", "Khojki:Khoj", "Khudawadi:Sind", "Lao:Laoo", "Latin:Latn", "Lepcha:Lepc", "Limbu:Limb", "Linear_A:Lina", "Linear_B:Linb", "Lisu:Lisu", "Lycian:Lyci", "Lydian:Lydi", "Mahajani:Mahj", "Makasar:Maka", "Malayalam:Mlym", "Mandaic:Mand", "Manichaean:Mani", "Marchen:Marc", "Masaram_Gondi:Gonm", "Medefaidrin:Medf", "Meetei_Mayek:Mtei", "Mende_Kikakui:Mend", "Meroitic_Cursive:Merc", "Meroitic_Hieroglyphs:Mero", "Miao:Plrd", "Modi:Modi", "Mongolian:Mong", "Mro:Mroo", "Multani:Mult", "Myanmar:Mymr", "Nabataean:Nbat", "Nandinagari:Nand", "New_Tai_Lue:Talu", "Newa:Newa", "Nko:Nkoo", "Nushu:Nshu", "Nyiakeng_Puachue_Hmong:Hmnp", "Ogham:Ogam", "Ol_Chiki:Olck", "Old_Hungarian:Hung", "Old_Italic:Ital", "Old_North_Arabian:Narb", "Old_Permic:Perm", "Old_Persian:Xpeo", "Old_Sogdian:Sogo", "Old_South_Arabian:Sarb", "Old_Turkic:Orkh", "Old_Uyghur:Ougr", "Oriya:Orya", "Osage:Osge", "Osmanya:Osma", "Pahawh_Hmong:Hmng", "Palmyrene:Palm", "Pau_Cin_Hau:Pauc", "Phags_Pa:Phag", "Phoenician:Phnx", "Psalter_Pahlavi:Phlp", "Rejang:Rjng", "Runic:Runr", "Samaritan:Samr", "Saurashtra:Saur", "Sharada:Shrd", "Shavian:Shaw", "Siddham:Sidd", "SignWriting:Sgnw", "Sinhala:Sinh", "Sogdian:Sogd", "Sora_Sompeng:Sora", "Soyombo:Soyo", "Sundanese:Sund", "Syloti_Nagri:Sylo", "Syriac:Syrc", "Tagalog:Tglg", "Tagbanwa:Tagb", "Tai_Le:Tale", "Tai_Tham:Lana", "Tai_Viet:Tavt", "Takri:Takr", "Tamil:Taml", "Tangsa:Tnsa", "Tangut:Tang", "Telugu:Telu", "Thaana:Thaa", "Thai:Thai", "Tibetan:Tibt", "Tifinagh:Tfng", "Tirhuta:Tirh", "Toto", "Ugaritic:Ugar", "Vai:Vaii", "Vithkuqi:Vith", "Wancho:Wcho", "Warang_Citi:Wara", "Yezidi:Yezi", "Yi:Yiii", "Zanabazar_Square:Zanb", // ECMAScript 2019/Unicode 11: // "Dogra:Dogr", "Gunjala_Gondi:Gong", "Hanifi_Rohingya:Rohg", // "Makasar:Maka", "Medefaidrin:Medf", "Old_Sogdian:Sogo", "Sogdian:Sogd", // ECMAScript 2020/Unicode 12 // "Elymaic:Elym", "Nandinagari:Nand", "Nyiakeng_Puachue_Hmong:Hmnp", "Wancho:Wcho", // ECMAScript 2021/Unicode 13 // "Chorasmian:Chrs", "Dives_Akuru:Diak", "Khitan_Small_Script:Kits", "Yezidi:Yezi", // ECMAScript 2022/Unicode 14: // "Cypro_Minoan:Cpmn", "Old_Uyghur:Ougr", "Tangsa:Tnsa", "Toto", // "Vithkuqi:Vith" "" }; } // namespace updata namespace unishared { template std::string stringify(const Type value, const char *const fmt) { char buffer[BufSize]; std::sprintf(buffer, fmt, value); return std::string(buffer); } void throw_error(const char *const s, ...) { char buffer[256]; va_list va; va_start(va, s); std::vsprintf(buffer, s, va); va_end(va); throw std::runtime_error(buffer); } void read_file(std::string &str, const char *const filename, const char *const dir) { const std::string path(std::string(dir ? dir : "") + filename); FILE *const fp = std::fopen(path.c_str(), "r"); std::fprintf(stdout, "Reading '%s'... ", path.c_str()); if (fp) { static const std::size_t bufsize = 4096; char *const buffer = static_cast(std::malloc(bufsize)); if (buffer) { for (;;) { const std::size_t size = std::fread(buffer, 1, bufsize, fp); if (!size) break; str.append(buffer, size); } std::fclose(fp); std::fputs("done.\n", stdout); std::free(buffer); return; } } std::fputs("failed...", stdout); throw_error("could not open!"); } bool write_file(const char *const filename, const std::string &str) { FILE *const fp = std::fopen(filename, "wb"); std::fprintf(stdout, "Writing '%s'... ", filename); if (fp) { const bool success = std::fwrite(str.c_str(), 1, str.size(), fp) == str.size(); std::fclose(fp); if (success) { std::fputs("done.\n", stdout); return true; } } std::fputs("failed...\n", stdout); return false; } } // namespace unishared struct up_options { const char *outfilename; const char *indir; int version; int errorno; up_options(const int argc, const char *const *const argv) : outfilename("srell_updata.hpp") , indir("") , version(110) , errorno(0) { for (int index = 1; index < argc; ++index) { const char firstchar = argv[index][0]; if (firstchar == '-' || firstchar == '/') { const char *const option = argv[index] + 1; if (std::strcmp(option, "o") == 0) { if (index + 1 >= argc) goto NO_ARGUMENT; outfilename = argv[++index]; } else if (std::strcmp(option, "v") == 0) { if (index + 1 >= argc) goto NO_ARGUMENT; version = static_cast(std::strtod(argv[++index], NULL) * 100.0); } else if (std::strcmp(option, "id") == 0) { if (index + 1 >= argc) goto NO_ARGUMENT; indir = argv[++index]; } else goto UNKNOWN_OPTION; continue; NO_ARGUMENT: std::fprintf(stdout, "[Error] no argument for \"%s\" specified.\n", argv[index]); errorno = -2; } else { UNKNOWN_OPTION: std::fprintf(stdout, "[Error] unknown option \"%s\" found.\n", argv[index]); errorno = -1; } } } }; // struct up_options class unicode_property { public: unicode_property() : re_licenseline_("^#\\s*(.*)$") , re_licenseend_("^#\\s*$") { } int create_updata(std::string &outdata, const up_options &opts) { int errorno = opts.errorno; const char *const unidatafilename = "UnicodeData.txt"; const char *const propdatafiles[] = { "PropList.txt", "DerivedCoreProperties.txt", "emoji-data.txt", "DerivedNormalizationProps.txt", "" }; const char *const scfilename = "Scripts.txt"; const char *const scxfilename = "ScriptExtensions.txt"; std::string licensetext; rangeholder general_category_values; rangeholder binary_properties; rangeholder scripts; rangeholder scriptextensions; sortedrangeholder combined_properties; // scriptnameholder ucs_to_scriptname; // codepoint->scriptname. if (errorno) return errorno; try { licensetext = "// "; licensetext += unidatafilename; licensetext += "\n//\n"; read_unidata(general_category_values, binary_properties, unidatafilename, opts.indir); set_additionalbinprops(binary_properties, general_category_values); // for ASCII, Any, Cn. create_compositecategories(general_category_values); // This needs "Cn". read_binprops(binary_properties, licensetext, propdatafiles, opts.indir); read_scripts(scripts, /* ucs_to_scriptname, */ licensetext, scfilename, opts.indir); scriptextensions = scripts; modify_for_scx(scriptextensions, /* ucs_to_scriptname, */ licensetext, scxfilename, opts.indir); combine_properties(combined_properties, general_category_values, "gc", updata::gc_values); combine_properties(combined_properties, binary_properties, "bp", updata::binary_property_names); combine_properties(combined_properties, scripts, "sc", updata::script_names); combine_properties(combined_properties, scriptextensions, "scx", updata::script_names); do_formatting(outdata, combined_properties, opts.version); licensetext.append(1, '\n'); outdata.insert(0, licensetext); } catch (std::runtime_error &e) { std::printf("\nError: %s\n", e.what()); errorno = 1; } return errorno; } private: typedef srell::regex_internal::uchar32 uchar32; typedef srell::regex_internal::range_pairs ucprange_array; typedef srell::regex_internal::range_pair ucprange; typedef srell::regex_internal::range_pair_helper ucprange_helper; typedef std::map rangeholder; // typedef srell::regex_internal::bitset<0x110000> ucsset; typedef std::vector strings_type; typedef std::map scriptnameholder; typedef std::map name_mapper; typedef name_mapper canonicalname_mapper; static const uchar32 invalid_u32value = srell::regex_internal::constants::invalid_u32value; static const uchar32 compositeclass = invalid_u32value; struct sorted_name_and_ranges { std::string ptype; std::string canonicalname; std::string namealiases; ucprange_array ucpranges; }; typedef std::vector sortedrangeholder; void split(strings_type &parts, const std::string &data, const char splitter) { std::string::size_type readpos = 0; for (;;) { std::string::size_type lineend = data.find(splitter, readpos); if (lineend == std::string::npos) { // if (readpos == data.size()) // break; parts.push_back(data.substr(readpos, data.size() - readpos)); break; } parts.push_back(data.substr(readpos, lineend - readpos)); ++lineend; readpos = lineend; } } std::string join(const char c, const strings_type &parts, const bool add_final_also = false) { std::string out; for (strings_type::size_type i = 0; i < parts.size(); ++i) out.append(parts[i] + c); if (!add_final_also && out.size()) out.resize(out.size() - 1); return out; } void read_unidata(rangeholder &gc, rangeholder &bp, const char *const unidatafilename, const char *const indir) { const srell::regex re_dataline("^([0-9A-F]+);([^;]*);(([^;]*);(?:[^;]*;){6}([^;]*)(?:;[^;]*){5})$"); const srell::regex re_rangefirst("^<(.*), First>$"); const std::string stringY("Y"); const std::string stringN("N"); uchar32 prevucp = invalid_u32value; std::string data; strings_type lines; srell::cmatch cmatch; // strings_type parts; std::string rangename; std::string rangefirstproperty; uchar32 rangefirstcp = 0; ucprange range; ucprange_array bidi_mirrored_ranges; unishared::read_file(data, unidatafilename, indir); split(lines, data, '\n'); for (strings_type::size_type i = 0; i < lines.size(); ++i) { const std::string &line = lines[i]; const char *const lineend = line.c_str() + line.size(); if (srell::regex_match(line.c_str(), lineend, cmatch, re_dataline)) { const srell::cmatch::value_type &codepoint = cmatch[1]; const srell::cmatch::value_type &name = cmatch[2]; const std::string name_string(name.str()); const std::string property(cmatch[3].str()); range.first = range.second = static_cast(std::strtol(codepoint.first, NULL, 16)); if (prevucp >= range.first && prevucp != invalid_u32value) unishared::throw_error("out of order: %.4lX >= %.4lX", prevucp, range.first); // parts.clear(); // split(parts, property, ';'); // if (parts.size() != 13) // unishared::throw_error("number of fields is not 13, but %u\n\t[%s]", parts.size(), line.c_str()); // const std::string &general_category = parts[0]; // const std::string &bidi_mirrored = parts[7]; const std::string general_category(cmatch[4].str()); const std::string bidi_mirrored(cmatch[5].str()); prevucp = range.first; if (rangename.size()) { if (name_string.compare("<" + rangename + ", Last>") != 0) unishared::throw_error("<%s, Last> does not follow its First line.\n\t%s follows insteadly.", rangename.c_str(), name_string.c_str()); if (property != rangefirstproperty) { unishared::throw_error("\"%s\": properties of First and Last are different.\n\tFirst: %s\n\tLast: %s", rangename.c_str(), rangefirstproperty.c_str(), property.c_str()); } range.first = rangefirstcp; rangename.clear(); } else if (srell::regex_match(name.first, name.second, cmatch, re_rangefirst)) { rangename = cmatch[1]; rangefirstproperty = property; rangefirstcp = range.first; continue; } // register "general_category" value. gc[general_category].join(range); // register "bidi_mirrored" value. if (bidi_mirrored == stringY) { bidi_mirrored_ranges.join(range); } else if (bidi_mirrored != stringN) unishared::throw_error("unknown Bidi_Mirrored value [%s] in %s.", bidi_mirrored.c_str(), line.c_str()); } else if (line.size()) unishared::throw_error("unknown format [%s]", line.c_str()); } bp["Bidi_Mirrored"] = bidi_mirrored_ranges; } // binary properties created from UnicodeData.txt. void set_additionalbinprops(rangeholder &bp, rangeholder &gc) { ucprange_array assigned_ranges; for (rangeholder::iterator it = gc.begin(); it != gc.end(); ++it) assigned_ranges.merge(it->second); bp["Any"].join(ucprange_helper(0x0000, 0x10ffff)); bp["ASCII"].join(ucprange_helper(0x0000, 0x007f)); bp["Assigned"]; // Only creates. No data. // bp["Assigned"] = assigned_ranges; assigned_ranges.negation(); gc["Cn"] = assigned_ranges; } void create_compositecategory(rangeholder &gc, const char *const newname, const char *const *categories) { ucprange_array array; uchar32 total = 0; array.append_newpair(ucprange_helper(compositeclass, 0)); for (; **categories; ++categories) { const char *const c = *categories; const uchar32 count = static_cast(gc[*categories].size()); array.append_newpair(ucprange_helper(c[0], c[1])); array.append_newpair(ucprange_helper(count, 0)); total += count; } array[0].second = total; gc[newname] = array; } void create_compositecategories(rangeholder &gc) { const char *const categoryLC[] = { "Ll", "Lt", "Lu", "" }; const char *const categoryL[] = { "Ll", "Lt", "Lu", "Lm", "Lo", "" }; const char *const categoryM[] = { "Mc", "Me", "Mn", "" }; const char *const categoryN[] = { "Nd", "Nl", "No", "" }; const char *const categoryC[] = { "Cc", "Cf", "Cn", "Co", "Cs", "" }; const char *const categoryP[] = { "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps", "" }; const char *const categoryZ[] = { "Zl", "Zp", "Zs", "" }; const char *const categoryS[] = { "Sc", "Sk", "Sm", "So", "" }; create_compositecategory(gc, "LC", categoryLC); create_compositecategory(gc, "L", categoryL); create_compositecategory(gc, "M", categoryM); create_compositecategory(gc, "N", categoryN); create_compositecategory(gc, "C", categoryC); create_compositecategory(gc, "P", categoryP); create_compositecategory(gc, "Z", categoryZ); create_compositecategory(gc, "S", categoryS); } void read_files(std::string &out, const char *const *propdatafiles, const char separator, const char *const indir) { for (; **propdatafiles; ++propdatafiles) { std::string data; unishared::read_file(data, *propdatafiles, indir); out.append(data); out.append(1, separator); out.append(1, '\n'); } } void read_binprops(rangeholder &bp, std::string &licensetext, const char *const *propdatafiles, const char *const indir) { const srell::regex re_propfmt("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*(\\S+)\\s*(#.*)?$"); const char separator = '\0'; bool licenseend_found = false; ucprange range; strings_type lines; srell::cmatch cmatch; { std::string alldata; read_files(alldata, propdatafiles, separator, indir); split(lines, alldata, '\n'); } for (std::string::size_type i = 0; i < lines.size(); ++i) { const std::string &line = lines[i]; const char *const lineend = line.c_str() + line.size(); if (!licenseend_found) { if (srell::regex_match(line.c_str(), lineend, cmatch, re_licenseend_)) { licensetext += "//\n"; licenseend_found = true; } else if (srell::regex_match(line.c_str(), lineend, cmatch, re_licenseline_)) { licensetext += "// " + cmatch[1].str() + '\n'; } } else if (line.size() == 1 && line[0] == separator) { licenseend_found = false; } else if (srell::regex_match(line.c_str(), lineend, cmatch, re_propfmt)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &property = cmatch[3]; // const srell::cmatch::value_type &comment = cmatch[4]; range.first = static_cast(std::strtol(begin.first, NULL, 16)); if (end.matched) range.second = static_cast(std::strtol(end.first, NULL, 16)); else range.second = range.first; bp[property].join(range); } } } void read_scripts(rangeholder &sc, /* scriptnameholder &ucs_to_scriptname, */ std::string &licensetext, const char *const filename, const char *const indir) { const srell::regex re_scriptdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*(\\S+)\\s*(#.*)?$"); bool licenseend_found = false; ucprange range; strings_type lines; srell::cmatch cmatch; { std::string data; unishared::read_file(data, filename, indir); split(lines, data, '\n'); } for (std::string::size_type i = 0; i < lines.size(); ++i) { const std::string &line = lines[i]; const char *const lineend = line.c_str() + line.size(); if (!licenseend_found) { if (srell::regex_match(line.c_str(), lineend, cmatch, re_licenseend_)) { licensetext += "//\n"; licenseend_found = true; } else if (srell::regex_match(line.c_str(), lineend, cmatch, re_licenseline_)) { licensetext += "// " + cmatch[1].str() + '\n'; } } else if (srell::regex_match(line.c_str(), lineend, cmatch, re_scriptdata)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &scriptname = cmatch[3]; // const srell::cmatch::value_type &comment = cmatch[4]; range.first = static_cast(std::strtol(begin.first, NULL, 16)); if (end.matched) range.second = static_cast(std::strtol(end.first, NULL, 16)); else range.second = range.first; sc[scriptname].join(range); } } } canonicalname_mapper load_canonicalnames(const char *const *names) { canonicalname_mapper canonicalnames; strings_type parts; for (; **names; ++names) { parts.clear(); split(parts, *names, ':'); const std::string canonicalname(parts[0]); for (strings_type::size_type i = 0; i < parts.size(); ++i) { canonicalnames[parts[i]] = canonicalname; } } return canonicalnames; } void modify_for_scx(rangeholder &scx, /* scriptnameholder &ucs_to_scriptname, */ std::string &licensetext, const char *const filename, const char *const indir) { const srell::regex re_scxdata("^\\s*([0-9A-Fa-f]{4,})(?:\\.\\.([0-9A-Fa-f]{4,}))?\\s*;\\s*(\\S.*?\\S)\\s*(#.*)?$"); const canonicalname_mapper canonicalnames(load_canonicalnames(updata::script_names)); const std::string name_common("Common"); const std::string name_inherited("Inherited"); bool licenseend_found = false; ucprange_array common = scx[name_common]; ucprange_array inherited = scx[name_inherited]; ucprange range; strings_type lines; srell::cmatch cmatch; strings_type scxparts; std::map warning_out; { std::string data; unishared::read_file(data, filename, indir); split(lines, data, '\n'); } for (std::string::size_type i = 0; i < lines.size(); ++i) { const std::string &line = lines[i]; const char *const lineend = line.c_str() + line.size(); if (!licenseend_found) { if (srell::regex_match(line.c_str(), lineend, cmatch, re_licenseend_)) { licensetext += "//\n"; licenseend_found = true; } else if (srell::regex_match(line.c_str(), lineend, cmatch, re_licenseline_)) { licensetext += "// " + cmatch[1].str() + '\n'; } } else if (srell::regex_match(line.c_str(), lineend, cmatch, re_scxdata)) { const srell::cmatch::value_type &begin = cmatch[1]; const srell::cmatch::value_type &end = cmatch[2]; const srell::cmatch::value_type &scxnames = cmatch[3]; // const srell::cmatch::value_type &comment = cmatch[4]; range.first = static_cast(std::strtol(begin.first, NULL, 16)); if (end.matched) range.second = static_cast(std::strtol(end.first, NULL, 16)); else range.second = range.first; common.remove_range(range); inherited.remove_range(range); { scxparts.clear(); split(scxparts, scxnames, ' '); for (strings_type::size_type i = 0; i < scxparts.size(); ++i) { const std::string &scriptname = scxparts[i]; if (scriptname.size()) { const canonicalname_mapper::const_iterator it = canonicalnames.find(scriptname); if (it != canonicalnames.end()) scx[it->second].join(range); else { // unishared::throw_error("canonical name for \"%s\" is not found.", scriptname.c_str()); if (!warning_out.count(scriptname)) { std::printf("[Info] canonical name for \"%s\" is not found. New script?\n", scriptname.c_str()); warning_out[scriptname] = true; } } } } } } } scx[name_common] = common; scx[name_inherited] = inherited; } void combine_properties(sortedrangeholder &base, const rangeholder &addition, const char *const ptype, const char *const *aliasnames) { const canonicalname_mapper canonicalnames(load_canonicalnames(aliasnames)); sorted_name_and_ranges elem; strings_type names; for (; **aliasnames; ++aliasnames) { const std::string aliases(*aliasnames); bool pdata_found = false; names.clear(); split(names, aliases, ':'); const std::string canonicalname(names[0]); for (strings_type::size_type i = 0; i < names.size(); ++i) { const rangeholder::const_iterator it = addition.find(names[i]); if (it != addition.end()) { elem.ucpranges = it->second; pdata_found = true; break; } } if (!pdata_found) unishared::throw_error("no property value for \"%s\" found.", aliases.c_str()); elem.ptype = ptype; elem.canonicalname = canonicalname; elem.namealiases = aliases; base.push_back(elem); } } name_mapper create_ptype_mappings() { name_mapper categories; categories["gc"] = "general_category"; categories["bp"] = "binary"; categories["sc"] = "script"; categories["scx"] = "script_extensions"; return categories; } std::string create_ptypes(const name_mapper &ptypes) { std::string ptypedef("\tstruct ptype\n\t{\n\t\tstatic const T2 unknown = 0;\n"); const char *names[] = { "bp", "gc", "sc", "scx", "" }; for (unsigned int i = 0; *names[i];) { const name_mapper::const_iterator it = ptypes.find(names[i]); if (it == ptypes.end()) unishared::throw_error("name for ptype \"%s\" is not found.", names[i]); ptypedef += "\t\tstatic const T2 " + it->second + " = " + unishared::stringify<16>(++i, "%u") + ";\n"; } ptypedef += "\t};\n"; return ptypedef; } std::string ranges_to_string(const ucprange_array &array, const std::string &indent) { std::string rangestring(indent); unsigned count = 0; for (ucprange_array::size_type i = 0; i < array.size(); ++i) { const ucprange &range = array[i]; if (count == 4) { count = 0; rangestring += '\n' + indent; } else if (count) { rangestring += ' '; } rangestring += "0x" + unishared::stringify<16>(range.first, "%.4lX") + ", 0x" + unishared::stringify<16>(range.second, "%.4lX") + ','; ++count; } return rangestring; } void drop_finalcomma(std::string &data) { std::string::size_type commapos = data.rfind(','); if (commapos != std::string::npos) data.erase(commapos, 1); } std::string create_pnametable(const std::string &return_table, const int version, const std::string &template1, const std::string &template2, const std::string &indent) { std::string out(version == 100 ? "\tstatic const T3 *propertyname_table()\n\t{\n\t\tstatic const T3 table[] =\n\t\t{\n" : template1 + "const T3 " + template2 + "propertynametable[] =\n{\n"); const char *const *pnames = updata::property_names; out.append(indent + "\"*\",\t// #0:unknown\n"); out.append(indent + "\"*\",\t// #1:binary\n"); for (unsigned int i = 2; **pnames; ++pnames, ++i) { out.append(indent); out.append(1, '"'); out.append(*pnames); out.append("\",\t// #" + unishared::stringify<16>(i, "%u") + '\n'); } out.append(indent + "\"\"\n"); out.append(return_table); return out; } std::string join_dropcomma_append(const strings_type &s, const std::string &return_table) { std::string tmp(join('\n', s, true)); drop_finalcomma(tmp); tmp.append(return_table); return tmp; } void do_formatting(std::string &out, const sortedrangeholder &alldata, const int version) { typedef std::map registered_sequence; const std::string template1("template \n"); const std::string template2("unicode_property_data::"); const std::string return_table(version == 100 ? "\t\t};\n\t\treturn table;\n\t}\n" : "};\n"); const std::string indent(version == 100 ? "\t\t\t" : "\t"); name_mapper ptype_mappings(create_ptype_mappings()); const std::string ptypes(create_ptypes(ptype_mappings)); const std::string pnames(create_pnametable(return_table, version, template1, template2, indent)); std::string pnumbers("\tstatic const T1 unknown = 0;\n"); // property numbers. strings_type rangetable; strings_type lookup_ranges; unsigned int property_id_number = 1; unsigned int property_number = 1; registered_sequence registered; std::string lookup_numbers; unsigned int offset = 0; out.append(template1 + "struct unicode_property_data\n{\n"); if (version == 100) { rangetable.push_back(std::string("\tstatic const T4 *ranges()\n\t{\n\t\tstatic const T4 table[] =\n\t\t{")); lookup_ranges.push_back(std::string("\tstatic const T6 *position_table()\n\t{\n\t\tstatic const T6 table[] =\n\t\t{\n\t\t\t{ 0, 0 },\t// #0 unknown")); } else { rangetable.push_back(template1 + "const T4 " + template2 + "rangetable[] =\n{"); lookup_ranges.push_back(template1 + "const T6 " + template2 + "positiontable[] =\n{\n\t{ 0, 0 },\t// #0 unknown"); } for (sortedrangeholder::size_type i = 0; i < alldata.size(); ++i) { const sorted_name_and_ranges &elem = alldata[i]; const std::string ptype = elem.ptype; const std::string name = elem.canonicalname; const std::string aliases = elem.namealiases; const ucprange_array &array = elem.ucpranges; const std::string pnumber_keyname(ptype + '_' + name); const std::string position_comment(' ' + ptype + '=' + aliases); std::string rangestring(ranges_to_string(array, indent)); unsigned int numofranges = static_cast(array.size()); unsigned int pno = property_number; const registered_sequence::const_iterator rit = registered.find(rangestring); if (rit != registered.end()) { pno = rit->second; lookup_ranges[pno] += position_comment; rangetable[pno * 2 - 1] += position_comment; lookup_numbers.append(indent + "{ ptype::" + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::stringify<16>(pno, "%u") + " },\t// #" + unishared::stringify<16>(property_id_number, "%u") + "\n"); } else { // ucpranges of "Assigned" is empty. const bool compositeclass_found = array.size() && array[0].first == compositeclass; if (compositeclass_found) { std::printf("[Info] composite property \"%s\" found.\n", aliases.c_str()); numofranges = array[0].second; rangestring = indent + "// "; for (ucprange_array::size_type i = 1; i < array.size(); ++i) { const ucprange &range = array[i]; if (i > 1) rangestring += " + "; rangestring += static_cast(range.first); rangestring += static_cast(range.second); rangestring += ':' + unishared::stringify<16>(array[++i].first, "%u"); } } else { registered[rangestring] = property_number; } lookup_numbers.append(indent + "{ ptype::" + ptype_mappings[ptype] + ", \"" + aliases + "\", " + unishared::stringify<16>(property_number, "%u") + " },\t// #" + unishared::stringify<16>(property_id_number, "%u") + "\n"); lookup_ranges.push_back(indent + "{ " + unishared::stringify<16>(offset, "%u") + ", " + unishared::stringify<16>(numofranges, "%u") + " },\t// #" + unishared::stringify<16>(pno, "%u") + position_comment); rangetable.push_back(indent + "// #" + unishared::stringify<16>(pno, "%u") + " (" + unishared::stringify<16>(offset, "%u") + '+' + unishared::stringify<16>(numofranges, "%u") + "):" + position_comment); rangetable.push_back(rangestring); if (!compositeclass_found) offset += numofranges; ++property_number; } pnumbers.append("\tstatic const T1 " + pnumber_keyname + " = " + unishared::stringify<16>(property_id_number, "%u") + ";\t// #" + unishared::stringify<16>(pno, "%u") + '\n'); ++property_id_number; } pnumbers.append("\tstatic const T1 last_property_number = " + unishared::stringify<16>(--property_number, "%u") + ";\n"); lookup_numbers.append(indent + "{ ptype::unknown, \"\", 0 }\n"); lookup_numbers.append(return_table); lookup_numbers.insert(0, version == 100 ? "\tstatic const T5 *rangenumber_table()\n\t{\n\t\tstatic const T5 table[] =\n\t\t{\n\t\t\t{ ptype::unknown, \"*\", 0 },\t// #0\n" : template1 + "const T5 " + template2 + "rangenumbertable[] =\n{\n\t{ ptype::unknown, \"*\", 0 },\t// #0\n"); out.append(pnumbers); out.append(ptypes); if (version == 100) { out.append(pnames); out.append(join_dropcomma_append(rangetable, return_table)); out.append(lookup_numbers); out.append(join_dropcomma_append(lookup_ranges, return_table)); out.append("};\n"); } else { out.append("\tstatic const T3 propertynametable[];\n"); out.append("\tstatic const T4 rangetable[];\n"); out.append("\tstatic const T5 rangenumbertable[];\n"); out.append("\tstatic const T6 positiontable[];\n\n"); out.append("\tstatic const T3 *propertyname_table()\n\t{\n\t\treturn propertynametable;\n\t}\n"); out.append("\tstatic const T4 *ranges()\n\t{\n\t\treturn rangetable;\n\t}\n"); out.append("\tstatic const T5 *rangenumber_table()\n\t{\n\t\treturn rangenumbertable;\n\t}\n"); out.append("\tstatic const T6 *position_table()\n\t{\n\t\treturn positiontable;\n\t}\n"); out.append("};\n\n"); out.append(pnames); out.append("\n"); out.append(join_dropcomma_append(rangetable, return_table)); out.append("\n"); out.append(lookup_numbers); out.append("\n"); out.append(join_dropcomma_append(lookup_ranges, return_table)); } if (version > 100) out.append("#define SRELL_UPDATA_VERSION 110\n"); } srell::regex re_licenseline_; srell::regex re_licenseend_; }; // class unicode_property int main(const int argc, const char *const *const argv) { up_options upopts(argc, argv); std::string outdata; unicode_property up; int errorno = up.create_updata(outdata, upopts); if (errorno == 0) { if (!unishared::write_file(upopts.outfilename, outdata)) errorno = 2; } return errorno; }