mirror of
https://github.com/goatcorp/Dalamud.git
synced 2025-12-12 10:17:22 +01:00
Implement xiv fixes into Dalamud.Boot (#857)
This commit is contained in:
parent
02dd1eddec
commit
75de126c9d
40 changed files with 41576 additions and 196 deletions
|
|
@ -5,6 +5,7 @@
|
|||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include "nethost/nethost.h"
|
||||
#include "..\..\Dalamud.Boot\logging.h"
|
||||
|
||||
CoreCLR::CoreCLR(void* calling_module)
|
||||
: m_calling_module(calling_module)
|
||||
|
|
@ -82,7 +83,7 @@ int CoreCLR::load_runtime(const std::wstring& runtime_config_path, const struct
|
|||
// Success_HostAlreadyInitialized
|
||||
if (result == 1)
|
||||
{
|
||||
printf("Success_HostAlreadyInitialized (0x1) ");
|
||||
logging::print<logging::I>("Success_HostAlreadyInitialized (0x1)");
|
||||
result = 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
#include <Windows.h>
|
||||
#include <Shlobj.h>
|
||||
#include "CoreCLR.h"
|
||||
#include "..\..\Dalamud.Boot\logging.h"
|
||||
|
||||
FILE* g_CmdStream;
|
||||
void ConsoleSetup(const std::wstring console_name)
|
||||
|
|
@ -16,6 +17,7 @@ void ConsoleSetup(const std::wstring console_name)
|
|||
freopen_s(&g_CmdStream, "CONOUT$", "w", stdout);
|
||||
freopen_s(&g_CmdStream, "CONOUT$", "w", stderr);
|
||||
freopen_s(&g_CmdStream, "CONIN$", "r", stdin);
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
}
|
||||
|
||||
void ConsoleTeardown()
|
||||
|
|
@ -61,7 +63,7 @@ int InitializeClrAndGetEntryPoint(
|
|||
|
||||
if (result != 0)
|
||||
{
|
||||
printf("Error: Unable to get RoamingAppData path (err=%d)\n", result);
|
||||
logging::print<logging::E>("Unable to get RoamingAppData path (err={})", result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -71,13 +73,13 @@ int InitializeClrAndGetEntryPoint(
|
|||
|
||||
// =========================================================================== //
|
||||
|
||||
wprintf(L"with dotnet_path: %s\n", dotnet_path);
|
||||
wprintf(L"with config_path: %s\n", runtimeconfig_path.c_str());
|
||||
wprintf(L"with module_path: %s\n", module_path.c_str());
|
||||
logging::print<logging::I>(L"with dotnet_path: %s", dotnet_path);
|
||||
logging::print<logging::I>(L"with config_path: %s", runtimeconfig_path.c_str());
|
||||
logging::print<logging::I>(L"with module_path: %s", module_path.c_str());
|
||||
|
||||
if (!std::filesystem::exists(dotnet_path))
|
||||
{
|
||||
printf("Error: Unable to find .NET runtime path\n");
|
||||
logging::print<logging::E>("Error: Unable to find .NET runtime path");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -88,13 +90,13 @@ int InitializeClrAndGetEntryPoint(
|
|||
dotnet_path,
|
||||
};
|
||||
|
||||
printf("Loading hostfxr... ");
|
||||
logging::print<logging::I>("Loading hostfxr...");
|
||||
if ((result = g_clr->load_hostfxr(&init_parameters)) != 0)
|
||||
{
|
||||
printf("\nError: Failed to load the `hostfxr` library (err=0x%08x)\n", result);
|
||||
logging::print<logging::E>("Failed to load the `hostfxr` library (err=0x{:08x})", result);
|
||||
return result;
|
||||
}
|
||||
printf("Done!\n");
|
||||
logging::print<logging::I>("Done!");
|
||||
|
||||
// =========================================================================== //
|
||||
|
||||
|
|
@ -105,17 +107,17 @@ int InitializeClrAndGetEntryPoint(
|
|||
dotnet_path,
|
||||
};
|
||||
|
||||
printf("Loading coreclr... ");
|
||||
logging::print<logging::I>("Loading coreclr... ");
|
||||
if ((result = g_clr->load_runtime(runtimeconfig_path, &runtime_parameters)) != 0)
|
||||
{
|
||||
printf("\nError: Failed to load coreclr (err=%d)\n", result);
|
||||
logging::print<logging::E>("Failed to load coreclr (err={})", result);
|
||||
return result;
|
||||
}
|
||||
printf("Done!\n");
|
||||
logging::print<logging::I>("Done!");
|
||||
|
||||
// =========================================================================== //
|
||||
|
||||
printf("Loading module... ");
|
||||
logging::print<logging::I>("Loading module...");
|
||||
if ((result = g_clr->load_assembly_and_get_function_pointer(
|
||||
module_path.c_str(),
|
||||
entrypoint_assembly_name.c_str(),
|
||||
|
|
@ -123,10 +125,10 @@ int InitializeClrAndGetEntryPoint(
|
|||
entrypoint_delegate_type_name.c_str(),
|
||||
nullptr, entrypoint_fn)) != 0)
|
||||
{
|
||||
printf("\nError: Failed to load module (err=%d)\n", result);
|
||||
logging::print<logging::E>("Failed to load module (err={})", result);
|
||||
return result;
|
||||
}
|
||||
printf("Done!\n");
|
||||
logging::print<logging::I>("Done!");
|
||||
|
||||
// =========================================================================== //
|
||||
|
||||
|
|
|
|||
1
lib/Nomade040-nmd
Submodule
1
lib/Nomade040-nmd
Submodule
|
|
@ -0,0 +1 @@
|
|||
Subproject commit 33ac3b62c7d1eb28ae6b71d4dd78aa133ef96488
|
||||
484
lib/srell3_009/history_en.txt
Normal file
484
lib/srell3_009/history_en.txt
Normal file
|
|
@ -0,0 +1,484 @@
|
|||
20220511; version 3.009:
|
||||
* Fixed an optimisation bug that caused /abcd|ab/ not to match "abc".
|
||||
|
||||
20220504; version 3.008:
|
||||
* Fixed the behaviour of [^\P{...}] when the icase flag is set, as it
|
||||
behaved similarly to the one in v-mode that has been proposed in
|
||||
TC39.
|
||||
|
||||
20220429; version 3.007:
|
||||
* Further modification to the counter mechanism.
|
||||
|
||||
20220428; version 3.006:
|
||||
* Modified the mechanism of the counter used for repetition.
|
||||
* Re-removed the implementation of linear search for small character
|
||||
classes.
|
||||
|
||||
20220424; version 3.005:
|
||||
* Fixed a bug that caused /(?<=$.*)/ not to match the end of "a" when
|
||||
the multiline flag is set.
|
||||
* Preparations for \A, \z, (?m:) that have been proposed in TC39.
|
||||
|
||||
20220420; version 3.004:
|
||||
* Added a new optimisation for /A*B/ and /A+B/ where a character class
|
||||
A overlaps a character or character class B, such as /[A-Za-z]+ing/,
|
||||
/".*"/.
|
||||
|
||||
20220416; version 3.003:
|
||||
* Combined two optimisation functions into one.
|
||||
* Reduced the amount of code for lookaround (lookahead and lookbehind)
|
||||
assertions.
|
||||
|
||||
20220416; version 3.002:
|
||||
* Fixed a bug that caused regex_match or regex_search with the
|
||||
match_continuous flag being set to fail when the entry point
|
||||
selector introduced in version 3.000 was used internally.
|
||||
|
||||
20211025; version 3.001:
|
||||
* Removed the code for splitting counter as it seemed to have no effect
|
||||
or to make performance a bit worse.
|
||||
* Fixed potential bugs.
|
||||
* Minor improvements.
|
||||
|
||||
20211023; version 3.000:
|
||||
* Updated srell_ucfdata2.hpp and srell_updata.hpp to support Unicode
|
||||
14.0.0.
|
||||
* Updated unicode/updataout.cpp to support Unicode 14. (Support in
|
||||
advance new script names that are expected to be available in RegExp
|
||||
of ECMAScript 2022).
|
||||
* Changed the type used to store a Unicode value when char32_t is not
|
||||
available, from an "unsigned integer type with width of at least 21
|
||||
bits" to a "one of at least 32 bits".
|
||||
* Changed the type used to store a repetition count or character class
|
||||
number when char32_t is not available, from "unsigned int" to
|
||||
"unsigned integer type of at least 32-bit width".
|
||||
* Added overflow check in the function that translates digits into a
|
||||
numeric value. For example, while up to the previous version
|
||||
/a{0,4294967297}/ was treated as /a{0,1}/ because of overflow when
|
||||
the unsigned int type is 32-bit width, SRELL now throws error_brace
|
||||
in cases like this.
|
||||
* Fixed a bug that caused /[^;]*^;?/ not to match the beginning of an
|
||||
input string when the multiline flag is not set.
|
||||
* Implemented a very simple and limited entry point selector.
|
||||
|
||||
20211004; version 2.930:
|
||||
* Added new typedefs whose prefix is u1632w- and support UTF-16 or
|
||||
UTF-32 depending on the value of WCHAR_MAX. (When 0xFFFF <=
|
||||
WCHAR_MAX < 0x10FFFF, u1632w- types are aliases of u16w- types.
|
||||
When 0x10FFFF <= WCHAR_MAX, u1632w- types are aliases of u32w-
|
||||
types).
|
||||
* Reduced the amount of memory used for Eytzinger layout search.
|
||||
* Various improvements. (Some of them are based on suggestions to NIRE
|
||||
by Marko Njezic).
|
||||
|
||||
20210624; version 2.920:
|
||||
* Added a new optimisation for the quantifier '?' (I.e., {0,1}).
|
||||
* Changed the version number of the ECMAScript specification
|
||||
referenced in misc/sample01.cpp to 2021.
|
||||
|
||||
20210429; version 2.912:
|
||||
* Fixed another bug in the optimisation introduced in version 2.900,
|
||||
which caused /aa|a|aa/ not to match "a" (Thanks to Jan Schrötter for
|
||||
the report).
|
||||
Incidentally, this optimisation can be disabled by defining
|
||||
SRELLDBG_NO_BRANCH_OPT2 prior to including srell.hpp.
|
||||
|
||||
20210424; version 2.911:
|
||||
* Fixed a bug in the optimisation introduced in version 2.900, which
|
||||
caused /abc|ab|ac/ not to match "ac". (Thanks for the bug report [As
|
||||
my email to the reporter was rejected by the email server and
|
||||
returned, it is unclear whether mentioning the name here is okay
|
||||
with the reporter. So, I refrain]).
|
||||
|
||||
20210407; version 2.910:
|
||||
* Fixed a potential memory leak in move assignment operators used by
|
||||
the pattern compiler since 2.900. (Thanks to Michal Švec for the
|
||||
report).
|
||||
|
||||
20210214; version 2.901:
|
||||
* Removed redundant template specialisations.
|
||||
|
||||
20210214; version 2.900:
|
||||
* Added a new optimisation for the alternative expression that consists
|
||||
of string literals, such as /abc|abd|acde/.
|
||||
* Fixed the problem that brought u(8|16)[cs]regex_(token_)?iterator
|
||||
(i.e., regex (token) iterators specialised for char8_t or char16_t)
|
||||
to a compile error.
|
||||
* Minor improvements.
|
||||
|
||||
20210131; version 2.810:
|
||||
* Improved internal UTF-8 iterators.
|
||||
|
||||
20200724; version 2.800:
|
||||
* Introduced the Eytzinger layout for binary search in the character
|
||||
class.
|
||||
* Reimplemented linear search for small character classes.
|
||||
* Modified handling of the property data used for parsing the name for
|
||||
a named capturing group. Now they are loaded only when needed
|
||||
instead of being loaded into an instance of basic_regex always.
|
||||
|
||||
20200714; version 2.730:
|
||||
* Added code to prevent redundant save and restore operations when
|
||||
nested capturing round brackets are processed.
|
||||
* Improved regex_iterator.
|
||||
|
||||
20200703; version 2.720:
|
||||
* Improved case-insensitive (icase) search using the
|
||||
Boyer-Moore-Horspool algorithm for UTF-8 string that includes
|
||||
non-ASCII characters or UTF-16 string that includes non-BMP
|
||||
characters.
|
||||
* Fixed a bug that caused regex_iterator->prefix().first to point to
|
||||
the beginning of the subject string instead of the end of the
|
||||
previous match (regression introduced in version 2.650, when
|
||||
three-iterators overloads were added to regex_search()).
|
||||
* In accordance with the fix above, when a three-iterators version of
|
||||
regex_search() is called, now match_results.position() returns a
|
||||
distance from the position passed as the lookbehind limit (3rd
|
||||
param of regex_search) and match_results.prefix().first points to
|
||||
the position passed as the beginning of the subject string (1st
|
||||
param of regex_search).
|
||||
* Fixed a bug that could cause a valid UTF-8 sequence being adjacent
|
||||
to an invalid UTF-8 sequence to be skipped when the BMH algorithm
|
||||
was used (regression introduced in version 2.630, when UTF-8
|
||||
handling was modified).
|
||||
|
||||
20200701; version 2.710:
|
||||
* Minor modifications to Boyer-Moore-Horspool search.
|
||||
|
||||
20200630; version 2.700:
|
||||
* Optimisation adjustments.
|
||||
|
||||
20200620: version 2.651:
|
||||
* Move the group name validity check to after parsing the \u escape.
|
||||
* Updated misc/sample01.cpp to version 1.103. Changed the version
|
||||
number of the ECMAScript specification it references to 2020 (ES11).
|
||||
|
||||
20200618: version 2.650:
|
||||
* To element access functions in match_results, added overload
|
||||
functions for specifying the group name by a pointer.
|
||||
* When a three-iterators version of regex_search() is used, SRELL now
|
||||
sets match_results::prefix::first to the position passed to as the
|
||||
lookbehind limit (third param) instead of the position passed to as
|
||||
the beginning of the subject (first param).
|
||||
* Removed some operations that seem to be redundant.
|
||||
|
||||
20200601: version 2.643:
|
||||
* Added "inline" to operators in syntax_option_type and
|
||||
match_flag_type types, based on a report that it is needed not to
|
||||
cause the multiple definition error.
|
||||
* Minor improvements.
|
||||
|
||||
20200530: version 2.642:
|
||||
* Reduced the size of memory allocated by the basic_regex instance.
|
||||
|
||||
20200528: version 2.641:
|
||||
* The fix in 2.640 was incomplete. Fixed the optimisation bug 1 again.
|
||||
* Optimisation adjustments.
|
||||
|
||||
20200516: version 2.640:
|
||||
* Fixed an optimisation bug 1: It was possible for regex_match to pass
|
||||
the end of a subject string under certain conditions.
|
||||
* Fixed an optimisation bug 2: ^ and $ were not given a chance to
|
||||
match an appropriate position in some cases when the multiline flag
|
||||
is set to true.
|
||||
* Updated srell_ucfdata2.hpp and srell_updata.hpp.
|
||||
|
||||
20200509: version 2.630:
|
||||
* SRELL's pattern compiler no longer permits invalid UTF-8 sequences
|
||||
in regular expressions. It throws regex_utf8. (Invalid UTF-8
|
||||
sequences in the subject string are not treated as an error.)
|
||||
* Fixed BMH search functions not to include extra (invalid) UTF-8
|
||||
trailing bytes following the real matched substring, in a returned
|
||||
result.
|
||||
* Fixed minor issues: 1) basic_regex.flags() did not return the
|
||||
correct value in some cases, 2) match_results.format() did not
|
||||
replace $<NAME> with an empty string when any capturing group whose
|
||||
name is NAME did not exist.
|
||||
|
||||
20200502: version 2.620:
|
||||
* Removed methods used for match_continuous and regex_match in the
|
||||
class for the Boyer-Moore-Horspool algorithm. Now SRELL always uses
|
||||
the automaton like earlier versions when they are processed.
|
||||
* Some clean-ups.
|
||||
|
||||
20200428: version 2.611:
|
||||
* Fixed a bug that caused /\d*/ not to match the head of "abc" but to
|
||||
match the end of it. (regression introduced in version 2.210.)
|
||||
|
||||
20200426: version 2.610:
|
||||
* Fixed a bug that caused case-insensitive (icase) BMH search to skip
|
||||
a matched sequence at the beginning of the entire text, when 1)
|
||||
search is done against UTF-8 or UTF-16 text, and 2) the searched
|
||||
pattern ends with a character that consists of multiple code units
|
||||
in that encoding.
|
||||
* Now SRELL parses a capturing group name according to the ECMA
|
||||
specification and strictly checks its validity. Group names like
|
||||
/(?<,>...)/ cause regex_error.
|
||||
|
||||
20200418: version 2.600:
|
||||
* To pass to regex_search() directly the limit of a sequence until
|
||||
where the automaton can lookbehind, added three-iterators versions
|
||||
of regex_search().
|
||||
* [Breaking Change] Removed the match_lblim_avail flag from
|
||||
match_flag_type and the lookbehind_limit member from match_results
|
||||
which were added in version 2.300.
|
||||
* Updated srell_ucfdata2.hpp and srell_updata.hpp to support Unicode
|
||||
13.0.0.
|
||||
* Updated unicode/updataout.cpp to support Unicode 13. (Support in
|
||||
advance new script names that will be available in RegExp of
|
||||
ECMAScript 2020).
|
||||
|
||||
20191118: version 2.500:
|
||||
* Modified basic_regex to hold precomputed tables for icase matching,
|
||||
instead of creating them from case folding data when its instance is
|
||||
first created.
|
||||
* In accordance with the change above, srell_ucfdata.hpp and
|
||||
ucfdataout.cpp that outputs the former were replaced with
|
||||
srell_ucfdata2.hpp that holds precomputed tables and ucfdataout2.cpp
|
||||
that outputs the former.
|
||||
* Changed the method of character class matching from linear search to
|
||||
binary search.
|
||||
* Changed the timing of optimisation of a character class from "when a
|
||||
closing bracket ']' is found" to "every time a character or
|
||||
character range is pushed to its character class array".
|
||||
* Removed all asserts.
|
||||
* Modified the pattern compiler to interpret sequential \uHHHH escapes
|
||||
as a Unicode code point value if they represent a valid surrogate
|
||||
pair. (By this change, incompatibilities with the ECMAScript
|
||||
specification disappeared.)
|
||||
* Fixed the position of an endif directive that caused a compiler
|
||||
error when -DSRELL_NO_NAMEDCAPTURE is specified.
|
||||
* Updated updataout.cpp to version 1.101.
|
||||
* Added a standalone version of SRELL in the single-header directory.
|
||||
|
||||
20190914: version 2.401:
|
||||
* Reduced the size of basic_regex. (It was bloated by my carelessness
|
||||
when support for Unicode property escapes was added).
|
||||
* Improved basic_regex::swap().
|
||||
|
||||
20190907: version 2.400:
|
||||
* Improved the performance of character class matching.
|
||||
* Modified the pattern compiler to interpret the \u escape sequence in
|
||||
the group name in accordance with the ECMAScript specification.
|
||||
* Updated ucfdataout.cpp to version 1.200. A new member has been added
|
||||
to the unicode_casefolding class in srell_ucfdata.hpp that
|
||||
ucfdataout.cpp generates.
|
||||
Because SRELL 2.400 and later need this added member, they cannot be
|
||||
used with srell_ucfdata.hpp output by ucfdataout.cpp version 1.101
|
||||
or earlier. (No problem in using an older version of SRELL with a
|
||||
newer version of srell_ucfdata.hpp).
|
||||
* Some clean-ups and improvements.
|
||||
|
||||
20190902: version 2.304:
|
||||
* Fixed regex_iterator that had been broken by the code clean-up in
|
||||
version 2.303.
|
||||
|
||||
20190810: version 2.303:
|
||||
* Refixed the problem that was fixed in version 2.302 as the fix was
|
||||
incomplete.
|
||||
* Cleaned up code.
|
||||
|
||||
20190809: version 2.302:
|
||||
* Bug fix: When (?...) has a quantifier, strings captured by round
|
||||
brackets inside it were not cleared in each repetition but carried
|
||||
over to the next loop. For example,
|
||||
/(?:(ab)|(cd))+/.exec("abcd") returned ["abcd", "ab", "cd"], instead
|
||||
of ["abcd", undefined, "cd"]. (The latter is correct).
|
||||
* Updated misc/sample01.cpp to version 1.102. Rewrote the chapter
|
||||
numbers in accordance with ECMAScript 2019 (ES10).
|
||||
|
||||
20190724: version 2.301:
|
||||
* In accordance with the ECMAScript spec, restricted the characters
|
||||
which can be escaped by '\', to the following fifteen characters:
|
||||
^$\.*+?()[]{}|/
|
||||
Only in the character class, i.e., inside [], '-' also becomes a
|
||||
member of the group.
|
||||
|
||||
20190717: version 2.300:
|
||||
* Added a feature for specifying the limit until where the automaton
|
||||
can lookbehind, separated from the beginning of a target sequence.
|
||||
(Addition of the match_lblim_avail flag to match_flag_type and the
|
||||
lookbehind_limit member to match_results).
|
||||
And, lookbehind_limit of match_results being private and used
|
||||
internally in regex_iterator is also set in its constructor.
|
||||
* Removed order restriction of capturing parentheses and
|
||||
backreferences, in accordance with the ECMAScript spec. Now /\1(.)/,
|
||||
/(?<=(.)\1)/, and /\k<a>(?<a>.)/ are all okay.
|
||||
* Updated misc/sample01.cpp to version 1.101. Added one compliance
|
||||
test from misc.js.
|
||||
|
||||
20190714: version 2.230:
|
||||
* Improved the performance of searching when regular expressions begin
|
||||
with a character or character class followed by a '*' or '+'. (E.g.,
|
||||
/[A-Za-z]+ing/).
|
||||
|
||||
20190707: version 2.221:
|
||||
* Changed the feature test macro used for checking availability of
|
||||
std::u8string, from __cpp_char8_t to __cpp_lib_char8_t.
|
||||
* When icase specified, if all characters in a character class become
|
||||
the same character as a result of case-folding, the pattern compiler
|
||||
has been changed to convert the character class to the character
|
||||
literal (e.g., /r[Ss\u017F]t/i -> /rst/i).
|
||||
* Fixed a minor issue.
|
||||
|
||||
20190617: version 2.220:
|
||||
* Changed the internal representation of repetition in the case that
|
||||
it becomes more compact by not using the counter.
|
||||
* Fixed an optimisation bug that caused searching for /a{1,2}?b/
|
||||
against "aab" to return "ab" instead of "aab". (Condition: a
|
||||
character or character class with a non-greedy quantifier is
|
||||
followed by its exclusive character or character class).
|
||||
|
||||
20190613: version 2.210:
|
||||
* Improved a method of matching for expressions like /ab|cd|ef/ (where
|
||||
string literals separated by '|' begin with a character exclusive
|
||||
to each other).
|
||||
|
||||
20190603: version 2.202:
|
||||
* Fixed a bug that caused regex_match to behave like regex_search in
|
||||
the situation where the BMH algorithm is used.
|
||||
|
||||
20190531: version 2.200:
|
||||
* For searching with an ordinary (non-regex) string, added an
|
||||
implementation based on the Boyer-Moore-Horspool algorithm.
|
||||
* Improved UTF-8 iterators.
|
||||
* Fixed behaviours of \b and \B when icase specified, to match /.\B./i
|
||||
against "s\u017F".
|
||||
* Fixed minor issues.
|
||||
|
||||
20190508: version 2.100:
|
||||
* Fixed a bug that caused failure of capturing when 1) a pair of
|
||||
capturing brackets exists in a lookbehind assertion, and 2) variable
|
||||
length expressions exist in both the left side of and the inside of
|
||||
the pair of brackets. E.g., given "1053" =~ /(?<=(\d+)(\d+))$/, no
|
||||
appropriate string was set for $2.
|
||||
* Updated srell_ucfdata.hpp and srell_updata.hpp to support Unicode
|
||||
12.1.0.
|
||||
* Updated unicode/updataout.cpp to support Unicode 12. (Support in
|
||||
advance a new binary property and new script names that will be
|
||||
available in RegExp of ECMAScript 2019 and new script names that are
|
||||
anticipated to be available in RegExp of ECMAScript 2020).
|
||||
* Changed the newline character in srell.hpp from CR+LF to LF.
|
||||
* Modified unicode/*.cpp to output LF as a newline instead of CR+LF.
|
||||
* Updated misc/sample01.cpp to version 1.100:
|
||||
1. Rewrote the chapter numbers in subtitles of compliance tests, in
|
||||
accordance with ECMAScript 2018 Language Specification (ES9).
|
||||
(The old chapter numbers were based on ECMAScript specifications
|
||||
up to version 5.1).
|
||||
2. Added one compliance test from ECMAScript 2018 Language
|
||||
Specification 21.2.2.3, NOTE.
|
||||
* Modified the macros for detecting C++11 features.
|
||||
* Changed the method of the character class.
|
||||
* For all the constructors and assign functions of basic_regex to have
|
||||
a default argument for flag_type, reimplemented syntax_option_type
|
||||
and match_flag_type (missed changes between TR1 -> C++11).
|
||||
* Experimental support for the char8_t type. If a compiler supports
|
||||
char8_t (detected by the __cpp_char8_t macro), classes whose names
|
||||
have the "u8-" prefix accept a sequence of char8_t and handle it as
|
||||
a UTF-8 string. If char8_t is not supported, the classes handle a
|
||||
sequence of char as a UTF-8 string, as before.
|
||||
* As classes that always handle a sequence of char as a UTF-8 string,
|
||||
new classes whose names have the "u8c-" prefix were added. They
|
||||
correspond to the classes having the "u8-" prefix in their names up
|
||||
to version 2.002:
|
||||
* u8cregex; u8ccmatch, u8csmatch; u8ccsub_match, u8cssub_match;
|
||||
u8ccregex_iterator, u8csregex_iterator; u8ccregex_token_iterator,
|
||||
u8csregex_token_iterator.
|
||||
|
||||
20180717: version 2.002:
|
||||
* Changed the maximum number of hexdigits in \u{h...} from six to
|
||||
'unlimited' in accordance with the ECMAScript specification. ("one
|
||||
to six hexadecimal digits" of the old implementation was based on
|
||||
the proposal document).
|
||||
* Updated updataout.cpp to version 1.001. Encountering unknown
|
||||
(newly-encoded) script names is no longer treated as an error.
|
||||
* Updated srell_ucfdata.hpp and srell_updata.hpp to support Unicode
|
||||
11.0.0.
|
||||
|
||||
20180204: version 2.001:
|
||||
* When icase is specified, [\W] (a character class containing \W) no
|
||||
longer matches any of [KkSs\u017F\u212A] (ecma262 issue #512).
|
||||
|
||||
20180127: version 2.000:
|
||||
* Added the following features that are to be included into RegExp of
|
||||
ECMAScript 2018:
|
||||
* New syntax option flag for '.' to match every code point, dotall,
|
||||
was added to srell::regex_constants as a value of
|
||||
syntax_option_type and to srell::basic_regex as a value of
|
||||
flag_type.
|
||||
* New expressions to support the Unicode property, \p{...} and
|
||||
\P{...}.
|
||||
* Named capture groups (?<NAME>...) and the new expression for
|
||||
backreference to a named capture group, \k<NAME>.
|
||||
* The behaviors of lookbehind assertions changed. Now both (?<=...)
|
||||
and (?<!...) support variable-length lookbehind.
|
||||
|
||||
20180125; version 1.401:
|
||||
* Limited the maximum of numbers that are recognised as backreference
|
||||
in match_results.format() up to 99, in accordance with the
|
||||
ECMAScript specification. (I.e., restricted to $1..$9 and $01..$99).
|
||||
* Removed an unused macro and its related code.
|
||||
|
||||
20180101; version 1.400:
|
||||
* Changed the behaviour of the pattern compiler so that an empty
|
||||
non-capturing group can have a quantifier, for example, /(?:)*/. It
|
||||
is a meaningless expression, but changed just for compatibility with
|
||||
RegExp of ECMAScript.
|
||||
* Fixed a hang bug: This occurred when 1) a non-capturing group has a
|
||||
quantifier, 2) and the length of the group itself can be zero-width,
|
||||
3) and a backreference that can be zero-width is included in the
|
||||
group somewhere other than the last, such as /(.*)(?:\1.*)*/.
|
||||
|
||||
20171216; version 1.300:
|
||||
* Fixed an important bug: /^(;[^;]*)*$/ did not match ";;;;" because
|
||||
of a bug in optimisation. This problem occurred when a sequence of
|
||||
regular expressions ended like /(A...B*)*$/ where a character or
|
||||
character set that A represents and the one that B represents are
|
||||
exclusive to each other.
|
||||
|
||||
20170621; version 1.200:
|
||||
* Updated srell_ucfdata.hpp to support Unicode 10.0.0.
|
||||
* Improved u8regex_traits to handle corrupt UTF-8 sequences more
|
||||
safely.
|
||||
|
||||
20150618; version 1.141:
|
||||
Updated srell_ucfdata.hpp to support Unicode 8.0.0.
|
||||
|
||||
20150517; version 1.140:
|
||||
* Modified the method for regex_match() to determine whether a
|
||||
sequence of regular expressions is matched against a sequence of
|
||||
characters. (Issue raised at #2273 in C++ Standard Library Issues
|
||||
List).
|
||||
* Restricted the accepted range of X in the expression "\cX" to
|
||||
[A-Za-z] in accordance with the ECMAScript specification.
|
||||
* Fixed the problem that caused parens in a lookaround assertion not
|
||||
to capture a sequence correctly in some circumstances because the
|
||||
bug fix done in version 1.111 was imperfect.
|
||||
|
||||
20150503; version 1.130:
|
||||
* Improved case-folding functions.
|
||||
* Updated unicode/ucfdataout.cpp to version 1.100.
|
||||
* Fixed a typo in #if directives for u(16|32)[cs]match.
|
||||
|
||||
20150425; version 1.120:
|
||||
* Fixed the bug that caused characters in U+010000-U+10FFFF in UTF-8
|
||||
(i.e., four octet length characters) not to have been recognised.
|
||||
* Updated misc/sample01.cpp to version 1.010.
|
||||
|
||||
20150402; version 1.111:
|
||||
* Fixed the problem that caused $2 of "aaa" =~ /((.*)*)/ to be empty
|
||||
instead of "aaa" because of a bug in optimisation.
|
||||
|
||||
20141101; version 1.110:
|
||||
* Several fixes based on a bug report:
|
||||
1. Added "this->" to compile() in basic_regex::assign().
|
||||
2. Implemented operator=() functions explicitly instead of using
|
||||
default ones generated automatically.
|
||||
* unicode/ucfdataout.cpp revised and updated to version 1.001.
|
||||
|
||||
20140622; version 1.101:
|
||||
Updated srell_ucfdata.hpp to support Unicode 7.0.0.
|
||||
|
||||
20121118; version 1.100:
|
||||
The first released version.
|
||||
|
||||
421
lib/srell3_009/history_ja.txt
Normal file
421
lib/srell3_009/history_ja.txt
Normal file
|
|
@ -0,0 +1,421 @@
|
|||
20220511; version 3.009:
|
||||
・最適化バグにより /abcd|ab/ が "abc" にマッチしなかった問題を修正。
|
||||
|
||||
20220504; version 3.008:
|
||||
・icase指定時の[^\P{...}]の振る舞いが、TC39で提案中のv-modeのそれに近
|
||||
いものになっていた問題を修正。
|
||||
|
||||
20220429; version 3.007:
|
||||
・カウンタの仕組みをさらに変更。
|
||||
|
||||
20220428; version 3.006:
|
||||
・繰り返し処理用のカウンタを調整。
|
||||
・小さな文字クラス用の線形探索を再削除。
|
||||
|
||||
20220424; version 3.005:
|
||||
・multiline指定時に /(?<=$.*)/ が "a" の終わりにマッチしなかった問題を
|
||||
修正。
|
||||
・TC39で提案中の\A, \z, (?m:)の準備。
|
||||
|
||||
20220420; version 3.004:
|
||||
・'*' または '+' 付きの文字クラスが後続する文字または文字クラスと排他
|
||||
的になっていない表現用の最適化処理を追加。例:/[A-Za-z]+ing/,
|
||||
/".*"/ など。
|
||||
|
||||
20220416; version 3.003:
|
||||
・2つの最適化函数を1つに統合。
|
||||
・先読み (lookahead)・戻り読み (lookbehind) 用のコード量を削減。
|
||||
|
||||
20220416; version 3.002:
|
||||
・3.000で導入した簡易エントリーポイント選択の使用時に、regex_matchや
|
||||
match_continuousフラグが指定されたregex_searchが機能しない場合があっ
|
||||
た問題を修正。
|
||||
|
||||
20211025; version 3.001:
|
||||
・カウンタ分割を廃止。効果がないかむしろ若干速度が低下しているように見
|
||||
えるため。
|
||||
・潜在的なバグを修正。
|
||||
・その他細かな改良など。
|
||||
|
||||
20211023; version 3.000:
|
||||
・srell_ucfdata2.hppとsrell_updata.hppとをUnicode 14.0.0対応に更新。
|
||||
・unicode/updataout.cppをUnicode 14対応に更新(ECMAScript 2022で対応さ
|
||||
れる見込みのスクリプト名の先行対応)。
|
||||
・char32_t未対応のコンパイラでUnicode値を保持するため内部で使用する型
|
||||
を「21ビット以上あるunsigned整数型」から「32ビット以上あるunsigned整
|
||||
数型」に変更。
|
||||
・char32_t未対応のコンパイラで繰り返し回数や文字クラス番号を保持するの
|
||||
に使う型を「unsigned int」から「32ビット以上あるunsigned整数型」に変
|
||||
更。
|
||||
・数値用パーザにoverflowチェックを追加。例:unsigned int型が32ビットの
|
||||
幅の時、前の版まで /a{0,4294967297}/ は /a{0,1}/ 相当になってしまっ
|
||||
ていましたが、前記のチェックを入れたことによりこのような場合には
|
||||
error_braceがthrowされるようになっています。
|
||||
・非multilineモード時に /[^;]*^;?/ が入力文字列の先頭にマッチしなかっ
|
||||
たバグを修正。
|
||||
・ごく簡易なエントリーポイント選択を実装。
|
||||
|
||||
20211004; version 2.930:
|
||||
・WCHAR_MAXの値に基づいてUTF-16/UTF-32対応が切り替わるu1632w-型を新規
|
||||
に追加(WCHAR_MAXが0xFFFF以上・0x10FFFF未満ならu1632w-型はu16w-型の
|
||||
別名となり、WCHAR_MAXが0x10FFFF以上ならu1632w-型はu32w-型の別名とな
|
||||
ります)。
|
||||
・Eytzinger layout検索時に使われるメモリ使用量を削減。
|
||||
・その他細かな改良など(いくつかはNIREに対するMarko Njezic氏の改善案に
|
||||
基づきます)。
|
||||
|
||||
20210624; version 2.920:
|
||||
・?({0,1}相当)用の最適化処理を追加。
|
||||
・misc/sample01.cpp内で参照しているECMAScript仕様書の版を2021に変更。
|
||||
|
||||
20210429; version 2.912:
|
||||
・2.900で導入した最適化処理のバグにより /aa|a|aa/ が "a" にマッチしな
|
||||
くなっていた問題を修正(報告してくださったJan Schrötter氏に感謝しま
|
||||
す)。
|
||||
ちなみにこの最適化処理は、srell.hppをincludeする前に
|
||||
SRELLDBG_NO_BRANCH_OPT2マクロを定義しておくと無効化できます。
|
||||
|
||||
20210424; version 2.911:
|
||||
・2.900で導入した最適化処理内の不用意な行削除が原因で、/abc|ab|ac/ が
|
||||
"ac" に対してマッチしなくなっていた問題を修正(バグ報告に感謝します)。
|
||||
|
||||
20210407; version 2.910:
|
||||
・2.900以降、パターンコンパイラ内部でmove代入演算子が使われる時にメモ
|
||||
リリークしていた問題を修正(報告してくださったMichal Švec氏に感謝し
|
||||
ます)。
|
||||
|
||||
20210214; version 2.901:
|
||||
・不要なテンプレートの特殊化を削除。
|
||||
|
||||
20210214; version 2.900:
|
||||
・文字列のみからなる選択(例:/abc|abd|acde/)用の最適化処理を新規に追
|
||||
加。
|
||||
・u(8|16)[cs]regex_(token_)?iteratorがコンパイルエラーとなり使用できな
|
||||
かった問題を修正。
|
||||
・その他細かな改良など。
|
||||
|
||||
20210131; version 2.810:
|
||||
・UTF-8用内部iteratorの改良。
|
||||
|
||||
20200724; version 2.800:
|
||||
・文字クラスの二分探索にEytzinger layoutを導入。
|
||||
・小さな文字クラス用に線形探索を再実装。
|
||||
・名前付き括弧の名前部分をパーズするためのプロパティーデータの扱いを変
|
||||
更。basic_regex型インスタンス内に読み込むのを止めて、必要な時のみ読
|
||||
み込むように。
|
||||
|
||||
20200714; version 2.730:
|
||||
・入れ子になった捕獲括弧で冗長な退避・復元処理をせぬように変更。
|
||||
・regex_iteratorの改良。
|
||||
|
||||
20200703; version 2.720:
|
||||
・非ASCII文字を含むUTF-8文字列または非BMPの文字を含むUTF-16文字列を、
|
||||
Boyer-Moore-Horspoolアルゴリズムを用いて、大文字小文字の区別無しで
|
||||
(icase/case-insensitiveで) 検索する場合の処理の改良。
|
||||
・Version 2.650での変更により、regex_iterator->prefix().firstが前回マ
|
||||
ッチした位置の終端ではなく文字列全体の最初を指すようになってしまっ
|
||||
ていたのを修正。
|
||||
・上記修正に合わせて3イテレータ版のregex_search()が呼ばれる場合、
|
||||
match_results.position()は戻り読みの逆行限界として渡された位置
|
||||
(regex_searchの第3引数)を起点とした位置を返し、
|
||||
match_results.prefix().firstは検索開始位置(同第1引数)を指すように
|
||||
変更。
|
||||
・BMH検索時に、不正なUTF-8シークウェンスの前後にある有効なシークウェン
|
||||
スが読み飛ばされてしまう問題を修正(2.630でUTF-8の処理方法を変えた時
|
||||
に混入したバグ)。
|
||||
|
||||
20200701; version 2.710:
|
||||
・Boyer-Moore-Horspool検索の調整。
|
||||
|
||||
20200630; version 2.700:
|
||||
・最適化処理の調整。
|
||||
|
||||
20200620: version 2.651:
|
||||
・グループ名のチェックを行う位置を\uエスケープの解釈後に移動。
|
||||
・misc/sample01.cppをversion 1.103に更新。参照しているECMAScript仕様書
|
||||
の版を2020(ES11)に変更。
|
||||
|
||||
20200618: version 2.650:
|
||||
・名前付き括弧に捕獲された文字列へのアクセス用函数に、グループ名をポイ
|
||||
ンタで指定するoverloadをmatch_resultsに追加。
|
||||
・3イテレータ版のregex_search()使用時には、検索の開始位置ではなく戻り
|
||||
読み (lookbehind) の逆行限界として渡された位置のほうを
|
||||
match_results::prefix::firstにセットするよう変更。
|
||||
・不要と思われる処理をいくつか削除。
|
||||
|
||||
20200601: version 2.643:
|
||||
・syntax_option_typeおよびmatch_flag_typeのoperator函数にinline指定を
|
||||
追加(これがないとリンク時に多重定義エラーが出ることがあるとのご指摘
|
||||
がありました)。
|
||||
・その他細かな改良など。
|
||||
|
||||
20200530: version 2.642:
|
||||
・basic_regex型インスタンスが確保するメモリのサイズを削減。
|
||||
|
||||
20200528: version 2.641:
|
||||
・2.640での修正1が不完全であったため再修正。
|
||||
・最適化処理の調整。
|
||||
|
||||
20200516: version 2.640:
|
||||
・最適化バグの修正1: regex_matchが入力文字列の終端を通り過ぎてしまうこ
|
||||
とがあった問題を修正。
|
||||
・最適化バグの修正2: multilineフラグ指定時に ^ や $ が適切な位置でのマ
|
||||
ッチングをさせてもらえなくなってしまっていた問題を修正。
|
||||
・srell_ucfdata2.hppとsrell_updata.hppとを更新。
|
||||
|
||||
20200509: version 2.630:
|
||||
・正規表現中に不正なUTF-8のシークウェンスがあった場合、パターンコンパ
|
||||
イラがregex_utf8をthrowするように仕様変更(検索対象文字列中に不正な
|
||||
UTF-8の並びがあってもエラー扱いされません)。
|
||||
・UTF-8でBMH検索が行われる際、マッチした箇所の直後に余分な後続
|
||||
(trailing) バイトが続いていた場合にその部分もマッチング結果に含めて
|
||||
しまう問題を修正。
|
||||
・basic_regex.flags() が正しい値を返さないことがあったのを修正。
|
||||
・正規表現中で実際には使われていないグループ名 (NAME) を
|
||||
match_results.format()に渡す書式文字列の中で$<NAME>のようにして指定
|
||||
すると、その部分が空文字に置換されずそのまま残ってしまう問題を修正。
|
||||
|
||||
20200502: version 2.620:
|
||||
・Boyer-Moore-Horspoolアルゴリズム用クラスからmatch_continuous指定時用
|
||||
およびregex_match用の函数を削除。これらの処理時は以前のようにオート
|
||||
マトンを使うように変更。
|
||||
・その他クリーンナップ。
|
||||
|
||||
20200428: version 2.611:
|
||||
・/\d*/ が "abc" の冒頭にマッチせず末尾にマッチする問題を修正(Version
|
||||
2.210で混入したバグ)。
|
||||
|
||||
20200426: version 2.610:
|
||||
・Case-insensitive (icase) なBMH検索が行われる際、探している文字列が検
|
||||
索対象テキスト全体の先頭にあった場合に読み飛ばされてしまうことがある
|
||||
バグを修正(UTF-8またはUTF-16で、検索文字列の末尾が複数のコードユニ
|
||||
ットからなる文字である場合に発生)。
|
||||
・キャプチャグループ名のパーズをECMAScriptの仕様書通りきっちり行うよう
|
||||
に変更。これにより、前の版までは受理されていた /(?<,>...)/ のような
|
||||
グループ名はregex_errorがthrowされるように。
|
||||
|
||||
20200418: version 2.600:
|
||||
・戻り読み (lookbehind) の逆行限界を直接regex_search()に渡せるように
|
||||
3イテレータ版のregex_search()を追加。
|
||||
・[非互換変更] 2.300で導入したmatch_flag_typeのmatch_lblim_availフラグ
|
||||
と、match_resultsのlookbehind_limitメンバとを廃止。
|
||||
・srell_ucfdata2.hppとsrell_updata.hppとをUnicode 13.0.0対応に更新。
|
||||
・unicode/updataout.cppをUnicode 13対応に更新(ECMAScript 2020で対応さ
|
||||
れる見込みのスクリプト名の先行対応)。
|
||||
|
||||
20191118: version 2.500:
|
||||
・初めてbasic_regex型インスタンスが作られた時にcase foldingデータから
|
||||
icaseマッチング用テーブルを展開するのに代えて、最初から計算済みテー
|
||||
ブルを保持しているように仕様変更。
|
||||
・上記変更に併せてsrell_ucfdata.hppおよびそれを出力するucfdataout.cpp
|
||||
はお役御免とし、代わりに展開済みicase用テーブルを保持する
|
||||
srell_ucfdata2.hppとそれを出力するucfdataout2.cppとを追加。
|
||||
・文字クラスの照合方法を線形探索から二分探索に変更。
|
||||
・文字クラスの最適化処理のタイミングを「']' が見つかった時にまとめて一
|
||||
括」から「文字または文字コードの範囲をpushするたびごと逐次」に変更。
|
||||
・assertをすべて削除。
|
||||
・連続する\uHHHHがサロゲートペアをなしている場合はUnicode値として解釈
|
||||
するように変更(これによりECMAScript仕様との相違はなくなりました)。
|
||||
・SRELL_NO_NAMEDCAPTUREマクロ使用時にコンパイルエラーが出ていたのを修
|
||||
正。
|
||||
・updataout.cppを1.101にヴァージョンアップ。
|
||||
・単体版のsrellを追加(single-headerディレクトリ内)。
|
||||
|
||||
20190914: version 2.401:
|
||||
・basic_regex型インスタンスのサイズを削減(Unicode property escapes対
|
||||
応時にうっかり膨張させてしまっていました)。
|
||||
・basic_regex::swap()の改良。
|
||||
|
||||
20190907: version 2.400:
|
||||
・文字クラスの照合速度を改善。
|
||||
・パターンコンパイル時にグループ名中の\uエスケープを解釈するように変更
|
||||
(ECMAScriptの仕様に準拠)。
|
||||
・ucfdataout.cppを1.200にヴァージョンアップ。このプログラムが出力する
|
||||
srell_ucfdata.hpp中のunicode_casefoldingクラスに、新たにメンバ変数が
|
||||
追加されました。
|
||||
SRELL 2.400以降はこの追加されたメンバ変数をコンパイル時に必要とする
|
||||
ため、ucfdataout.cpp 1.101以前によって出力されたsrell_ucfdata.hppを
|
||||
SRELL 2.400以降で使うことはできません(古いSRELLで新しい
|
||||
srell_ucfdata.hppを使うことは可)。
|
||||
・その他コードの整理や改良など。
|
||||
|
||||
20190902: version 2.304:
|
||||
・Version 2.303のコード整理で壊れてしまっていたregex_iteratorを修復。
|
||||
|
||||
20190810: version 2.303:
|
||||
・2.302の修正が不完全であったため再修正。
|
||||
・その他コードの整理。
|
||||
|
||||
20190809: version 2.302:
|
||||
・(?...) に繰り返し指定がついている時、内側の括弧によって捕獲された文
|
||||
字列がループごとにクリアされず持ち越されていたバグを修正。
|
||||
例:/(?:(ab)|(cd))+/.exec("abcd") → 1番括弧はundefinedになるはずが
|
||||
"ab"になってしまっていた。
|
||||
・misc/sample01.cppをversion 1.102に更新。テスト名中の章番号を
|
||||
ECMAScript 2019 (ES10) 準拠に変更
|
||||
|
||||
20190724: version 2.301:
|
||||
・ECMAScriptの仕様に準じて、\でエスケープ可能な文字の種類を次の15字に
|
||||
限定。^$\.*+?()[]{}|/
|
||||
文字クラス内([]内)ではこの15字に加えて '-' も対象に。
|
||||
|
||||
20190717: version 2.300:
|
||||
・検索対象範囲とは別に、戻り読み (lookbehind) の逆行限界を指定できる機
|
||||
能を追加(match_flag_typeへのmatch_lblim_availフラグの追加と
|
||||
match_resultsへのlookbehind_limitメンバの追加)。
|
||||
これに併せてregex_iteratorのコンストラクタ内でも、内部で使うprivate
|
||||
なmatch_results型インスタンスのlookbehind_limitメンバに値を設定する
|
||||
ように変更。
|
||||
・ECMAScriptの仕様に合わせて、後方参照が対応する捕獲括弧より先に出現し
|
||||
てもエラー扱いせぬように変更。/\1(.)/, /(?<=(.)\1)/, /\k<a>(?<a>.)/
|
||||
などすべてOKに。
|
||||
・misc/sample01.cppをversion 1.101に更新。misc.jsより準拠テストを1つ追
|
||||
加。
|
||||
|
||||
20190714: version 2.230:
|
||||
・正規表現が '*' か '+' かを伴う文字または文字クラスで始まる場合の検索
|
||||
速度を改善(例:/[A-Za-z]+ing/)。
|
||||
|
||||
20190707: version 2.221:
|
||||
・std::u8stringの利用可否は__cpp_char8_tではなく__cpp_lib_char8_tを用
|
||||
いて判断するように変更。
|
||||
・icase指定時にcase-folding処理をした結果、文字クラス内の文字がすべて
|
||||
同じ文字になった場合には、文字クラスを解消して文字リテラルとして処理
|
||||
するように変更。例:/r[Ss\u017F]t/i → /rst/i。
|
||||
・その他問題を修正。
|
||||
|
||||
20190617: version 2.220:
|
||||
・カウンタを使わぬほうが内部表現がコンパクトになる繰り返しはカウンタを
|
||||
使わぬように変更。
|
||||
・最適化バグにより、/a{1,2}?b/.exec("aab") が "aab" ではなく "ab" を返
|
||||
していたのを修正(発生条件:最短一致優先の回数指定が付いている文字ま
|
||||
たは文字クラスの後ろに、その文字集合と排他的な文字または文字クラスが
|
||||
続いている場合)。
|
||||
|
||||
20190613: version 2.210:
|
||||
・/ab|cd|ef/ のような表現('|' で区切られている文字列の先頭文字が互い
|
||||
に排他的な場合)の照合方法を改良。
|
||||
|
||||
20190603: version 2.202:
|
||||
・BMHアルゴリズムが使われる状況で、regex_matchがregex_search相当の処理
|
||||
をしてしまうバグを修正。
|
||||
|
||||
20190531: version 2.200:
|
||||
・通常の(正規表現ではない)テキスト検索用に、Boyer-Moore-Horspoolアル
|
||||
ゴリズムに基づく実装を追加。
|
||||
・UTF-8用iteratorの改良。
|
||||
・icase指定時の\b/\Bの挙動を修正。/.\B./i が "s\u017F" にマッチするよ
|
||||
うに。
|
||||
・その他問題を修正。
|
||||
|
||||
20190508: version 2.100:
|
||||
・Lookbehind中に文字列のキャプチャがあり、かつその中および左方に可変長
|
||||
の正規表現があった場合、文字列の捕獲に失敗することがあったのを修正。
|
||||
例:"1053" =~ /(?<=(\d+)(\d+))$/ で$2に適切な文字列がセットされず。
|
||||
・srell_ucfdata.hppとsrell_updata.hppとをUnicode 12.1.0対応に更新。
|
||||
・unicode/updataout.cppをUnicode 12対応に更新(ECMAScript 2020で対応さ
|
||||
れる見込みのスクリプト名の先行対応)。
|
||||
・srell.hpp中の改行コードをCR+LFからLFに変更。
|
||||
・unicode/*.cppが出力するファイルの改行コードをCR+LFからLFに変更。
|
||||
・misc/sample01.cppをversion 1.010に更新。
|
||||
1. テスト名中の章番号をECMAScript 2018 (ES9) 準拠に変更(前版までは
|
||||
ECMAScript 5.1までの章番号準拠でした)。
|
||||
2. ECMAScript 2018規格の21.2.2.3 NOTEから準拠テストを1つ追加。
|
||||
・C++11の機能の使用可否を判定するマクロを変更。
|
||||
・文字クラスの処理方法を変更。
|
||||
・basic_regexの全コンストラクタと全assign函数とでflag_typeのdefault引
|
||||
数を指定できるように、syntax_option_typeとmatch_flag_typeとを再実装
|
||||
(TR1→C++11間の変更の見落とし)。
|
||||
・char8_t型に試験対応。コンパイラがchar8_tに対応している場合
|
||||
(__cpp_char8_tマクロ定義の有無で判断)、"u8-"というprefixの付いた
|
||||
クラスは「char8_t型文字列を受け取り、それをUTF-8として扱う」ように。
|
||||
char8_tに未対応の場合は従来通り、char型文字列をUTF-8として処理。
|
||||
・常に「char型文字列をUTF-8として扱う」クラスとして新規に"u8c-"という
|
||||
prefixの付いたクラスを追加。2.002までの"u8-"付きクラス相当。
|
||||
・u8cregex; u8ccmatch, u8csmatch; u8ccsub_match, u8cssub_match;
|
||||
u8ccregex_iterator, u8csregex_iterator; u8ccregex_token_iterator,
|
||||
u8csregex_token_iterator.
|
||||
|
||||
20180717: version 2.002:
|
||||
・ECMAScriptの仕様に合わせて \u{h...} の h... 部分の最大桁数を6から無
|
||||
制限に変更(変更前の1~6桁というのは提案書に基づく実装でした)。
|
||||
・updataout.cppを1.001に更新。新規に追加されたスクリプト名をエラー扱い
|
||||
せぬように修整。
|
||||
・srell_ucfdata.hppとsrell_updata.hppとをUnicode 11.0.0対応に更新。
|
||||
|
||||
20180204: version 2.001:
|
||||
・icase指定時に、[\W](\Wを含む文字class)が [KkSs\u017F\u212A] のいず
|
||||
れにもマッチせぬよう変更(関連:ecma262 issue #512)。
|
||||
|
||||
20180127; version 2.000:
|
||||
・ECMAScript 2018のRegExpに追加されることになった次の機能を実装:
|
||||
・'.' があらゆるコードポイントにマッチするようにするための指定
|
||||
"dotall" フラグを、srell::regex_constants内の syntax_option_type
|
||||
および srell::basic_regex内の flag_type に追加。
|
||||
・Unicode property用の表現、\p{...} と \P{...} とを追加。
|
||||
・名前付きキャプチャ (?<NAME>...) と、名前付きキャプチャによって捕獲
|
||||
された文字列を後方参照するための正規表現、\k<NAME> とを追加。
|
||||
・戻り読み (lookbehind) の振る舞いを変更。(?<=...), (?<!...) とも可変
|
||||
幅の戻り読みに対応。
|
||||
|
||||
20180125; version 1.401:
|
||||
・ECMAScriptの仕様に合わせて、match_results.format()内で後方参照として
|
||||
認識される数値を99までに制限(即ち$1~$9および$01~$99のみ有効)。
|
||||
・長い間メンテナンスしていないマクロを削除。
|
||||
|
||||
20180101; version 1.400:
|
||||
・/(?:)*/ のように、空のnon-capturingグループにも量指定子を付けられる
|
||||
ように変更(ECMAScriptのRegExpとの互換性確保のための変更で、使い道は
|
||||
おそらくありません)。
|
||||
・次の3条件が揃った時に固まってしまったのを修正: 1) non-capturingグル
|
||||
ープに量指定子が付いていて、2) そのグループ自身が0幅になり得て、3)
|
||||
そのグループ内の最後以外の場所に、0幅になり得る後方参照が現れる時。
|
||||
たとえば /(.*)(?:\1.*)*/ のような表現。
|
||||
|
||||
20171216; version 1.300:
|
||||
・最適化処理のバグにより、/^(;[^;]*)*$/ が ";;;;" にマッチしなかった問
|
||||
題を修正。この問題の発生条件は次の通り:
|
||||
・/(A...B*)*$/ のような終わり方をしていて、かつAとBとが互いに排他的
|
||||
な文字または文字集合である場合。
|
||||
|
||||
20170621; version 1.200:
|
||||
・srell_ucfdata.hppをUnicode 10.0.0対応に。
|
||||
・不正なUTF-8 sequenceに対するu8regex_traitsの振る舞いを改善。
|
||||
|
||||
20150618; version 1.141:
|
||||
srell_ucfdata.hppをUnicode 8.0.0対応に。
|
||||
|
||||
20150517; version 1.140:
|
||||
・regex_match()がマッチの成否を判定する方法の変更。
|
||||
(C++ Standard Library Issues List #2273 への対応)
|
||||
・ECMAScriptの仕様に合わせて \cX の X の範囲を [A-Za-z] に制限。
|
||||
・look-around assertions中の丸括弧が、ある条件下で正しく文字列をキャプ
|
||||
チャせぬ場合があった問題を修正。Version 1.111での修正が不完全であっ
|
||||
たことによるもの。
|
||||
|
||||
20150503; version 1.130:
|
||||
・case-folding用函数の改善。
|
||||
・unicode/ucfdataout.cppをversion 1.100に。
|
||||
・u(16|32)[cs]match用の#if directives中にあったtypoを修正。
|
||||
|
||||
20150425; version 1.120:
|
||||
・UTF-8文字列においてU+010000-U+10FFFFの範囲の文字(4オクテット長の文
|
||||
字)が認識されぬバグを修正。
|
||||
・misc/sample01.cppをversion 1.010に。
|
||||
|
||||
20150402; version 1.111:
|
||||
・最適化処理のバグにより、"aaa" =~ /((.*)*)/ の $2 が "aaa" ではなく空
|
||||
になってしまう問題を修正。
|
||||
|
||||
20141101; version 1.110:
|
||||
・バグ報告による修正:
|
||||
1. basic_regex::assign() 内の compile() に "this->" を追加。
|
||||
2. operator=() 函数を明示的に実装。
|
||||
・unicode/ucfdataout.cppをversion 1.001 に。
|
||||
|
||||
20140622; version 1.101:
|
||||
srell_ucfdata.hppをUnicode 7.0.0対応に。
|
||||
|
||||
20121118; version 1.100:
|
||||
最初のリリース版。
|
||||
|
||||
32
lib/srell3_009/license.txt
Normal file
32
lib/srell3_009/license.txt
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
/*****************************************************************************
|
||||
**
|
||||
** SRELL (std::regex-like library) version 3.009
|
||||
**
|
||||
** Copyright (c) 2012-2022, Nozomu Katoo. All rights reserved.
|
||||
**
|
||||
** Redistribution and use in source and binary forms, with or without
|
||||
** modification, are permitted provided that the following conditions are
|
||||
** met:
|
||||
**
|
||||
** 1. Redistributions of source code must retain the above copyright notice,
|
||||
** this list of conditions and the following disclaimer.
|
||||
**
|
||||
** 2. Redistributions in binary form must reproduce the above copyright
|
||||
** notice, this list of conditions and the following disclaimer in the
|
||||
** documentation and/or other materials provided with the distribution.
|
||||
**
|
||||
** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
|
||||
** IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||
** THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
** PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
||||
** CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
** EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
** PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
** LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
** NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
** SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**
|
||||
******************************************************************************
|
||||
**/
|
||||
|
||||
379
lib/srell3_009/misc/sample01.cpp
Normal file
379
lib/srell3_009/misc/sample01.cpp
Normal file
|
|
@ -0,0 +1,379 @@
|
|||
//
|
||||
// A sample program for SRELL (tests and benchmarks).
|
||||
// 2021/06/24; version 1.104
|
||||
//
|
||||
// Macro Options:
|
||||
// -DSTD_REGEX: std::regex used.
|
||||
// -DBOOST_REGEX: boost::regex used.
|
||||
// -DBOOST_XPRESSIVE: boost::xpressive used.
|
||||
// unspecified or others: SRELL used.
|
||||
//
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined(STD_REGEX)
|
||||
#include <regex>
|
||||
#define RE_PREFIX std
|
||||
#pragma message("std::regex selected.")
|
||||
#elif defined(BOOST_REGEX)
|
||||
#include <boost/regex.hpp>
|
||||
#define RE_PREFIX boost
|
||||
#pragma message("boost::regex selected.")
|
||||
#elif defined(BOOST_XPRESSIVE)
|
||||
#include <boost/xpressive/xpressive.hpp>
|
||||
#define RE_PREFIX boost::xpressive
|
||||
#pragma message("boost::xpressive selected.")
|
||||
#else
|
||||
#include "../srell.hpp"
|
||||
#define RE_PREFIX srell
|
||||
#pragma message("srell selected.")
|
||||
#endif
|
||||
|
||||
bool test(const std::string &str, const std::string &exp, const unsigned int max, const std::vector<std::string> *const expected = NULL)
|
||||
{
|
||||
#if !defined(BOOST_XPRESSIVE)
|
||||
RE_PREFIX::regex re;
|
||||
#else
|
||||
boost::xpressive::cregex re;
|
||||
#endif
|
||||
RE_PREFIX::cmatch mr;
|
||||
bool b = false;
|
||||
unsigned int num_of_failures = 0;
|
||||
|
||||
try
|
||||
{
|
||||
std::string matched;
|
||||
std::string msg;
|
||||
|
||||
#if !defined(BOOST_XPRESSIVE)
|
||||
re.assign(exp, RE_PREFIX::regex::ECMAScript);
|
||||
#else
|
||||
re = boost::xpressive::cregex::compile(exp, boost::xpressive::cregex::ECMAScript | boost::xpressive::cregex::not_dot_newline);
|
||||
#endif
|
||||
|
||||
const clock_t st = std::clock();
|
||||
|
||||
for (unsigned int i = 0; i < max; i++)
|
||||
#if !defined(BOOST_REGEX)
|
||||
b = RE_PREFIX::regex_search(str.c_str(), str.c_str() + str.size(), mr, re);
|
||||
#else
|
||||
b = RE_PREFIX::regex_search(str.c_str(), str.c_str() + str.size(), mr, re, boost::regex_constants::match_not_dot_newline);
|
||||
#endif
|
||||
|
||||
const clock_t ed = std::clock();
|
||||
|
||||
// std::fprintf(stdout, "\t\"%s\" =~ /%s/\n", str.c_str(), exp.c_str()); // Perl 5 style.
|
||||
std::fprintf(stdout, "\t/%s/.exec(\"%s\");\n", exp.c_str(), str.c_str()); // ECMAScript style.
|
||||
if (max > 1)
|
||||
std::fprintf(stdout, "\t%u times\n", max);
|
||||
std::fprintf(stdout, "\t%s (%ld msec)\n", b ? "Found" : "Not Found", static_cast<long>(static_cast<double>(ed - st) * 1000 / CLOCKS_PER_SEC));
|
||||
|
||||
for (RE_PREFIX::cmatch::size_type i = 0; i < mr.size(); ++i)
|
||||
{
|
||||
if (i)
|
||||
std::fprintf(stdout, "\t$%u = ", i);
|
||||
else
|
||||
std::fputs("\t$& = ", stdout);
|
||||
if (mr[i].matched)
|
||||
{
|
||||
matched = mr[i].str();
|
||||
msg = '"' + matched + '"' + " (%u+%u)";
|
||||
}
|
||||
else
|
||||
msg = matched = "(undefined)";
|
||||
|
||||
if (expected)
|
||||
{
|
||||
if (i < expected->size())
|
||||
{
|
||||
if (matched == expected->operator[](i))
|
||||
msg += "; passed!";
|
||||
else
|
||||
{
|
||||
msg += "; failed... (expected: \"" + expected->operator[](i) + "\")";
|
||||
++num_of_failures;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
msg += "; failed..."; // should not exist.
|
||||
++num_of_failures;
|
||||
}
|
||||
}
|
||||
msg += '\n';
|
||||
std::fprintf(stdout, msg.c_str(), mr.position(i), mr.length(i));
|
||||
}
|
||||
|
||||
if (!num_of_failures && expected->size() != mr.size())
|
||||
++num_of_failures;
|
||||
|
||||
std::fprintf(stdout, "Result: %s.\n\n", num_of_failures ? "failed" : "passed");
|
||||
return num_of_failures == 0;
|
||||
}
|
||||
catch (const RE_PREFIX::regex_error &e)
|
||||
{
|
||||
std::fprintf(stdout, "Error (regex_error): %d \"%s\"\n\n", e.code(), e.what());
|
||||
}
|
||||
catch (const std::exception &e)
|
||||
{
|
||||
std::fprintf(stdout, "Error (std::exception): \"%s\"\n\n", e.what());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
const unsigned int count = 100000;
|
||||
std::string exp;
|
||||
std::string str;
|
||||
std::vector<std::string> expected;
|
||||
unsigned int num_of_tests = 0;
|
||||
unsigned int num_of_tests_passed = 0;
|
||||
unsigned int num_of_benches = 0;
|
||||
unsigned int num_of_benches_passed = 0;
|
||||
|
||||
std::fputs("Test 1 (ECMAScript 2021 Language Specification 22.2.2.3, NOTE)\n", stdout);
|
||||
str = "abc";
|
||||
exp = "((a)|(ab))((c)|(bc))";
|
||||
expected.resize(7);
|
||||
expected[0] = "abc";
|
||||
expected[1] = "a";
|
||||
expected[2] = "a";
|
||||
expected[3] = "(undefined)";
|
||||
expected[4] = "bc";
|
||||
expected[5] = "(undefined)";
|
||||
expected[6] = "bc";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 2a (ECMAScript 2021 Language Specification 22.2.2.5.1, NOTE 2)\n", stdout);
|
||||
str = "abcdefghi";
|
||||
exp = "a[a-z]{2,4}";
|
||||
expected.resize(1);
|
||||
expected[0] = "abcde";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 2b (ECMAScript 2021 Language Specification 22.2.2.5.1, NOTE 2)\n", stdout);
|
||||
str = "abcdefghi";
|
||||
exp = "a[a-z]{2,4}?";
|
||||
expected[0] = "abc";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 3 (ECMAScript 2021 Language Specification 22.2.2.5.1, NOTE 2)\n", stdout);
|
||||
str = "aabaac";
|
||||
exp = "(aa|aabaac|ba|b|c)*";
|
||||
expected.resize(2);
|
||||
expected[0] = "aaba";
|
||||
expected[1] = "ba";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 4 (ECMAScript 2021 Language Specification 22.2.2.5.1, NOTE 3)\n", stdout);
|
||||
str = "zaacbbbcac";
|
||||
exp = "(z)((a+)?(b+)?(c))*";
|
||||
expected.resize(6);
|
||||
expected[0] = "zaacbbbcac";
|
||||
expected[1] = "z";
|
||||
expected[2] = "ac";
|
||||
expected[3] = "a";
|
||||
expected[4] = "(undefined)";
|
||||
expected[5] = "c";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 5a (ECMAScript 2021 Language Specification 22.2.2.5.1, NOTE 4)\n", stdout);
|
||||
str = "b";
|
||||
exp = "(a*)*";
|
||||
expected.resize(2);
|
||||
expected[0] = "";
|
||||
expected[1] = "";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 5b (ECMAScript 2021 Language Specification 22.2.2.5.1, NOTE 4)\n", stdout);
|
||||
str = "baaaac";
|
||||
exp = "(a*)b\\1+";
|
||||
expected[0] = "b";
|
||||
expected[1] = "";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 6a (ECMAScript 2021 Language Specification 22.2.2.8.2, NOTE 2)\n", stdout);
|
||||
str = "baaabac";
|
||||
exp = "(?=(a+))";
|
||||
expected[0] = "";
|
||||
expected[1] = "aaa";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 6b (ECMAScript 2021 Language Specification 22.2.2.8.2, NOTE 2)\n", stdout);
|
||||
str = "baaabac";
|
||||
exp = "(?=(a+))a*b\\1";
|
||||
expected[0] = "aba";
|
||||
expected[1] = "a";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 7 (ECMAScript 2021 Language Specification 22.2.2.8.2, NOTE 3)\n", stdout);
|
||||
str = "baaabaac";
|
||||
exp = "(.*?)a(?!(a+)b\\2c)\\2(.*)";
|
||||
expected.resize(4);
|
||||
expected[0] = "baaabaac";
|
||||
expected[1] = "ba";
|
||||
expected[2] = "(undefined)";
|
||||
expected[3] = "abaac";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
std::fputs("Test 8 (from https://github.com/tc39/test262/tree/master/test/built-ins/RegExp/lookBehind/misc.js)\n", stdout);
|
||||
str = "abc";
|
||||
exp = "(abc\\1)";
|
||||
expected.resize(2);
|
||||
expected[0] = "abc";
|
||||
expected[1] = "abc";
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_tests_passed;
|
||||
++num_of_tests;
|
||||
|
||||
#ifndef SKIP_BENCHMARK
|
||||
|
||||
std::fputs("Benchmark 01\n", stdout);
|
||||
//0123456
|
||||
str = "aaaabaa";
|
||||
exp = "^(.*)*b\\1$";
|
||||
expected.resize(2);
|
||||
expected[0] = "aaaabaa";
|
||||
expected[1] = "aa";
|
||||
if (test(str, exp, count, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 02\n", stdout);
|
||||
//012345678
|
||||
str = "aaaabaaaa";
|
||||
exp = "^(.*)*b\\1\\1$";
|
||||
expected[0] = "aaaabaaaa";
|
||||
expected[1] = "aa";
|
||||
if (test(str, exp, count, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 03\n", stdout);
|
||||
//01
|
||||
str = "ab";
|
||||
exp = "(.*?)*b\\1";
|
||||
expected[0] = "b";
|
||||
expected[1] = "";
|
||||
if (test(str, exp, count * 10, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 04\n", stdout);
|
||||
//01234567
|
||||
str = "acaaabbb";
|
||||
exp = "(a(.)a|\\2(.)b){2}";
|
||||
expected.resize(4);
|
||||
expected[0] = "aaabb";
|
||||
expected[1] = "bb";
|
||||
expected[2] = "(undefined)";
|
||||
expected[3] = "b";
|
||||
if (test(str, exp, count * 10, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 05\n", stdout);
|
||||
str = "aabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaa";
|
||||
exp = "(a*)(b)*\\1\\1\\1";
|
||||
expected.resize(3);
|
||||
expected[0] = "aabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbaaaaaa";
|
||||
expected[1] = "aa";
|
||||
expected[2] = "b";
|
||||
if (test(str, exp, count, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 06a\n", stdout);
|
||||
str = "aaaaaaaaaab";
|
||||
exp = "(.*)*b";
|
||||
expected.resize(2);
|
||||
expected[0] = "aaaaaaaaaab";
|
||||
expected[1] = "aaaaaaaaaa";
|
||||
if (test(str, exp, count * 10, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 06b\n", stdout);
|
||||
str = "aaaaaaaaaab";
|
||||
exp = "(.*)+b";
|
||||
if (test(str, exp, count * 10, &expected)) // the same results expected.
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 06c\n", stdout);
|
||||
str = "aaaaaaaaaab";
|
||||
exp = "(.*){2,}b";
|
||||
expected[1] = "";
|
||||
if (test(str, exp, count * 10, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 07\n", stdout);
|
||||
str = "aaaaaaaaaabc";
|
||||
exp = "(?=(a+))(abc)";
|
||||
expected.resize(3);
|
||||
expected[0] = "abc";
|
||||
expected[1] = "a";
|
||||
expected[2] = "abc";
|
||||
if (test(str, exp, count, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 08\n", stdout);
|
||||
str = "1234-5678-1234-456";
|
||||
exp = "(\\d{4}[-]){3}\\d{3,4}";
|
||||
expected.resize(2);
|
||||
expected[0] = "1234-5678-1234-456";
|
||||
expected[1] = "1234-";
|
||||
if (test(str, exp, count * 5, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
std::fputs("Benchmark 09\n", stdout);
|
||||
str = "aaaaaaaaaaaaaaaaaaaaa";
|
||||
exp = "(.*)*b";
|
||||
expected.resize(0);
|
||||
if (test(str, exp, 1, &expected))
|
||||
++num_of_benches_passed;
|
||||
++num_of_benches;
|
||||
|
||||
#endif // !defined(SKIP_BENCHMARK)
|
||||
|
||||
std::fprintf(stdout, "Results of tests: %u/%u passed.\n", num_of_tests_passed, num_of_tests);
|
||||
std::fprintf(stdout, "Results of benchmarks: %u/%u passed.\n", num_of_benches_passed, num_of_benches);
|
||||
|
||||
return 0;
|
||||
|
||||
std::fputs("Benchmark 10\n", stdout);
|
||||
str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz";
|
||||
exp = "(x+y*)+a";
|
||||
test(str, exp, 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
21
lib/srell3_009/readme_en.txt
Normal file
21
lib/srell3_009/readme_en.txt
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
How to Use
|
||||
|
||||
Put the following three files in one directory, and include srell.hpp.
|
||||
1. srell.hpp
|
||||
2. srell_ucfdata2.hpp (data for case folding)
|
||||
3. srell_updata.hpp (data for Unicode properties)
|
||||
|
||||
The files in the following directories are supplements. As SRELL does not use
|
||||
them, it is safe to remove them.
|
||||
|
||||
* misc
|
||||
Contains a source code file for a simple test and benchmark program.
|
||||
|
||||
* single-header
|
||||
Contains a standalone version of srell.hpp into which srell_ucfdata2.hpp
|
||||
and srell_updata.hpp have been merged.
|
||||
|
||||
* unicode
|
||||
Contains source code files for programs that generate srell_ucfdata2.hpp and
|
||||
srell_updata.hpp from the latest Unicode data text files.
|
||||
|
||||
23
lib/srell3_009/readme_ja.txt
Normal file
23
lib/srell3_009/readme_ja.txt
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
■使用法
|
||||
|
||||
次のファイルを同じディレクトリに置き、srell.hppをincludeするだけです。
|
||||
・srell.hpp
|
||||
・srell_ucfdata2.hpp(case folding用データ)
|
||||
・srell_updata.hpp(Unicode property用データ)
|
||||
|
||||
■付属物
|
||||
以下のディレクトリ内にあるものはおまけのようなものです。
|
||||
SRELL側からは参照していませんので、削除してしまってもライブラリの動作に
|
||||
影響はありません。
|
||||
|
||||
・misc
|
||||
簡単なテスト及びベンチマークプログラムのソースが入っています。
|
||||
|
||||
・single-header
|
||||
srell.hppの中にsrell_ucfdata2.hppとsrell_updata.hppとを統合してしまい、
|
||||
これ単体で使用できるようにしたstandalone版が入っています。
|
||||
|
||||
・unicode
|
||||
最新のUnicodeデータからsrell_ucfdata2.hpp及びsrell_updata.hppを作るため
|
||||
のプログラムのソースが入っています。
|
||||
|
||||
18361
lib/srell3_009/single-header/srell.hpp
Normal file
18361
lib/srell3_009/single-header/srell.hpp
Normal file
File diff suppressed because it is too large
Load diff
9868
lib/srell3_009/srell.hpp
Normal file
9868
lib/srell3_009/srell.hpp
Normal file
File diff suppressed because it is too large
Load diff
2491
lib/srell3_009/srell_ucfdata2.hpp
Normal file
2491
lib/srell3_009/srell_ucfdata2.hpp
Normal file
File diff suppressed because it is too large
Load diff
6000
lib/srell3_009/srell_updata.hpp
Normal file
6000
lib/srell3_009/srell_updata.hpp
Normal file
File diff suppressed because it is too large
Load diff
76
lib/srell3_009/unicode/readme_en.txt
Normal file
76
lib/srell3_009/unicode/readme_en.txt
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
Contents of this directory:
|
||||
|
||||
1. ucfdataout2.cpp
|
||||
2. updataout.cpp
|
||||
|
||||
----
|
||||
1. ucfdataout2.cpp
|
||||
|
||||
This is a C++ source file for a program that generates a newer version
|
||||
of srell_ucfdata2.hpp, which is a C++ source file that SRELL 2.500- includes
|
||||
for case-folding. It is generated by ucfdataout2 from CaseFolding.txt provided
|
||||
by the Unicode Consortium.
|
||||
|
||||
+---------------------------------------------------------------------------
|
||||
| What is CaseFolding.txt?
|
||||
|
|
||||
| It is a data file needed for case-insensitive matching based on the
|
||||
| Unicode Standard. Whenever a new version of the Unicode Standard is
|
||||
| released, CaseFolding.txt may also be updated accordingly.
|
||||
|
|
||||
+---------------------------------------------------------------------------
|
||||
|
||||
1-1. Usage
|
||||
|
||||
1) compile ucfdataout2.cpp,
|
||||
2) get the latest version of CaseFolding.txt, which is available at
|
||||
http://www.unicode.org/Public/UNIDATA/CaseFolding.txt ,
|
||||
3) put CaseFolding.txt and a binary file generated at 1) in the same
|
||||
directory and run the binary file,
|
||||
4) move the newly generated "srell_ucfdata2.hpp" to the directory where
|
||||
SRELL is put.
|
||||
|
||||
1-2. Compatibility
|
||||
|
||||
srell_ucfdata2.hpp is not compatible with srell_ucfdata.hpp that SRELL up
|
||||
to 2.401 was using.
|
||||
|
||||
----
|
||||
2. updataout.cpp
|
||||
|
||||
This is a C++ source file for a program that generates a newer version
|
||||
of srell_updata.hpp, which is a C++ source file that SRELL includes for
|
||||
the Unicode property escapes (\p{...} and \P{...}). It is generated by
|
||||
updataout with the following text files provided by the Unicode Consortium:
|
||||
|
||||
* DerivedCoreProperties.txt
|
||||
* DerivedNormalizationProps.txt
|
||||
* emoji-data.txt
|
||||
* PropList.txt
|
||||
* ScriptExtensions.txt
|
||||
* Scripts.txt
|
||||
* UnicodeData.txt
|
||||
|
||||
As well as CaseFolding.txt mentioned above, these files may be updated
|
||||
accordingly whenever a new version of the Unicode Standard is released.
|
||||
|
||||
2-1. Usage
|
||||
|
||||
1) compile updataout.cpp,
|
||||
2) get the latest versions of the text files mentioned above, which are
|
||||
available at:
|
||||
a. emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/
|
||||
b. others: http://www.unicode.org/Public/UNIDATA/ ,
|
||||
3) put the text files and a binary file generated at 1) in the same
|
||||
directory and run the binary file,
|
||||
4) move the newly generated "srell_updata.hpp" to the directory where
|
||||
SRELL is put.
|
||||
|
||||
Note: emoji-data.txt has been moved from /Public/UNIDATA/ to
|
||||
/Public/emoji/(version number)/ since Unicode 11.0.0.
|
||||
Since Unicode 13.0.0, moved to /Public/UNIDATA/emoji/ .
|
||||
|
||||
2-2. Compatibility
|
||||
|
||||
srell_updata.hpp does not have compatibility issues as of this release.
|
||||
|
||||
84
lib/srell3_009/unicode/readme_ja.txt
Normal file
84
lib/srell3_009/unicode/readme_ja.txt
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
■同梱物について
|
||||
|
||||
1. ucfdataout2.cpp
|
||||
2. updataout.cpp
|
||||
|
||||
----
|
||||
1. ucfdataout2.cpp
|
||||
|
||||
srell_ucfdata2.hppの最新版を作成するプログラムのソースファイルです。SRELLの
|
||||
2.5以降はcase-insensitiveな(大文字小文字の違いを無視した)照合を行うために、
|
||||
このsrell_ucfdata2.hppを必要とします。
|
||||
|
||||
ucfdataout2は、Unicode Consortiumより提供されているCaseFolding.txtというテキ
|
||||
ストデータからsrell_ucfdata2.hppを自動生成します。
|
||||
|
||||
+---------------------------------------------------------------------------
|
||||
| CaseFolding.txtとは
|
||||
|
|
||||
| Case-insensitiveな照合を行う際には、大文字小文字の違いを吸収するために
|
||||
| "case-folding" と呼ばれる処理が行われます。Unicode規格に基づいた
|
||||
| case-foldingを行うために、Unicode Consortiumから提供されているのが
|
||||
| CaseFolding.txtです。
|
||||
|
|
||||
| このデータファイルはUnicode規格がアップデートされるとそれに合わせて
|
||||
| アップデートされる可能性があります。
|
||||
|
|
||||
+---------------------------------------------------------------------------
|
||||
|
||||
1-1. 使用方法
|
||||
|
||||
1) ucfdataout2.cppをコンパイルします。
|
||||
2) 最新版のCaseFolding.txtを次のURLより取得します。
|
||||
http://www.unicode.org/Public/UNIDATA/CaseFolding.txt ,
|
||||
3) CaseFolding.txtと、1)で作成したバイナリとを同じフォルダに置いて
|
||||
バイナリを実行します。
|
||||
4) srell_ucfdata2.hppが生成されますので、それをSRELLの置かれているディレク
|
||||
トリへと移動させます。
|
||||
|
||||
1-2. 互換性
|
||||
|
||||
srell_ucfdata2.hppは、SRELL 2.401までが利用していたsrell_ucfdata.hppと互換
|
||||
性がありません。
|
||||
|
||||
----
|
||||
2. updataout.cpp
|
||||
|
||||
srell_updata.hppの最新版を作成するプログラムのソースファイルです。SRELLは
|
||||
Unicode property escapes(\p{...} と \P{...})を含む正規表現と文字列との照合
|
||||
を行うために、このsrell_updata.hppを必要とします。
|
||||
|
||||
updataoutは、Unicode Consortiumより提供されている次のテキストデータから
|
||||
srell_updata.hppを自動生成します。
|
||||
|
||||
・DerivedCoreProperties.txt
|
||||
・DerivedNormalizationProps.txt
|
||||
・emoji-data.txt
|
||||
・PropList.txt
|
||||
・ScriptExtensions.txt
|
||||
・Scripts.txt
|
||||
・UnicodeData.txt
|
||||
|
||||
先述のCaseFolding.txt同様、これらのテキストデータファイルもUnicode規格が
|
||||
アップデートされるとそれに合わせてアップデートされる可能性があります。
|
||||
|
||||
2-1. 使用方法
|
||||
|
||||
1) updataout.cppをコンパイルします。
|
||||
2) 前記テキストファイルの最新版を次のURLより取得します。
|
||||
a. emoji-data.txt: http://www.unicode.org/Public/UNIDATA/emoji/
|
||||
b. それ以外: http://www.unicode.org/Public/UNIDATA/
|
||||
3) これらのテキストファイルと、1)で作成したバイナリとを同じフォルダに
|
||||
置いてバイナリを実行します。
|
||||
4) srell_updata.hppが生成されますので、それをSRELLの置かれているディレク
|
||||
トリへと移動させます。
|
||||
|
||||
補註: Unicode 11.0.0以降、emoji-data.txt は /Public/UNIDATA/ から
|
||||
/Public/emoji/(ヴァージョン番号)/ へ移されました。
|
||||
さらに Unicode 13.0.0以降、/Public/UNIDATA/emoji/ へ移されました。
|
||||
|
||||
2-2. 互換性
|
||||
|
||||
srell_updata.hpp には非互換となるような変更はこれまでのところ加えられてい
|
||||
ません。
|
||||
|
||||
590
lib/srell3_009/unicode/ucfdataout2.cpp
Normal file
590
lib/srell3_009/unicode/ucfdataout2.cpp
Normal file
|
|
@ -0,0 +1,590 @@
|
|||
//
|
||||
// ucfdataout2.cpp: version 2.100 (2020/05/13).
|
||||
//
|
||||
// This is a program that generates srell_ucfdata2.hpp from CaseFolding.txt
|
||||
// provided by the Unicode Consortium. The latest version is available at:
|
||||
// http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
|
||||
//
|
||||
|
||||
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>
#include "../srell.hpp"
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1400
|
||||
#pragma warning(disable:4996)
|
||||
#endif
|
||||
|
||||
namespace unishared
{
// Formats `value` with the printf-style format `fmt` and returns the text.
//
// BufSize is the size in bytes of the temporary buffer, including the
// terminating NUL. Output that does not fit is truncated instead of being
// written past the end of the buffer. An empty string is returned if the
// formatting call itself reports an error.
template <const std::size_t BufSize, typename Type>
std::string stringify(const Type value, const char *const fmt)
{
	char buffer[BufSize];
	const int written = std::snprintf(buffer, BufSize, fmt, value);

	if (written < 0)
		return std::string();

	return std::string(buffer);
}

// Appends the contents of the file `dir` + `filename` to `str`.
//
// `dir` may be NULL, which is treated as an empty prefix. Progress is
// reported on stdout. Returns true when the whole file was read; returns
// false when the file cannot be opened or a read error occurs (in the latter
// case `str` may already hold the part that was read).
bool read_file(std::string &str, const char *const filename, const char *const dir)
{
	const std::string path(std::string(dir ? dir : "") + filename);
	FILE *const fp = std::fopen(path.c_str(), "r");

	std::fprintf(stdout, "Reading '%s'... ", path.c_str());

	if (!fp)
	{
		std::fputs("failed...\n", stdout);
		return false;
	}

	// A fixed local buffer cannot fail to allocate, so there is no path on
	// which the open file handle is left unclosed.
	char buffer[4096];

	for (;;)
	{
		const std::size_t size = std::fread(buffer, 1, sizeof buffer, fp);

		if (!size)
			break;

		str.append(buffer, size);
	}

	// fread() returns 0 both at end-of-file and on error; tell them apart.
	const bool success = std::ferror(fp) == 0;

	std::fclose(fp);
	std::fputs(success ? "done.\n" : "failed...\n", stdout);
	return success;
}

// Writes `str` to `filename` in binary mode, replacing any existing file.
//
// Progress is reported on stdout. Returns true only when every byte was
// written and the file was closed without error.
bool write_file(const char *const filename, const std::string &str)
{
	FILE *const fp = std::fopen(filename, "wb");

	std::fprintf(stdout, "Writing '%s'... ", filename);

	if (fp)
	{
		const bool written = std::fwrite(str.c_str(), 1, str.size(), fp) == str.size();
		// Buffered data is flushed by fclose(), so its result matters too.
		const bool closed = std::fclose(fp) == 0;

		if (written && closed)
		{
			std::fputs("done.\n", stdout);
			return true;
		}
	}
	std::fputs("failed...\n", stdout);
	return false;
}
}
// namespace unishared
|
||||
|
||||
// Command-line options of the generator.
//
// Recognised options (the prefix may be '-' or '/'; each one consumes the
// following argument):
//   -i  <file>  input file name           (default "CaseFolding.txt")
//   -o  <file>  output file name          (default depends on the version)
//   -v  <n>     output format version     (default 2)
//   -id <dir>   prefix prepended to the input file name (default "")
//
// When no output file is given, the default name is "srell_ucfdata.hpp" for
// versions below 2 and "srell_ucfdata2.hpp" otherwise.
//
// errorno is 0 when parsing succeeded, -1 when an unknown option was found
// and -2 when the last option is missing its argument. An error message is
// printed to stdout for each problem.
struct ucf_options
{
	const char *infilename;
	const char *outfilename;
	const char *indir;
	int version;
	int errorno;

	ucf_options(const int argc, const char *const *const argv)
		: infilename("CaseFolding.txt")
		, outfilename("srell_ucfdata2.hpp")
		, indir("")
		, version(2)
		, errorno(0)
	{
		bool outfile_specified = false;

		for (int index = 1; index < argc; ++index)
		{
			const char *const arg = argv[index];
			const bool prefixed = arg[0] == '-' || arg[0] == '/';
			const char *const option = prefixed ? arg + 1 : "";
			const bool is_infile = std::strcmp(option, "i") == 0;
			const bool is_outfile = std::strcmp(option, "o") == 0;
			const bool is_version = std::strcmp(option, "v") == 0;
			const bool is_indir = std::strcmp(option, "id") == 0;

			if (!(is_infile || is_outfile || is_version || is_indir))
			{
				// Keep scanning so that every bad argument is reported.
				std::fprintf(stdout, "[Error] unknown option \"%s\" found.\n", arg);
				errorno = -1;
				continue;
			}

			if (index + 1 >= argc)
			{
				// The option is the last argument, so nothing is left to parse.
				std::fprintf(stdout, "[Error] no argument for \"%s\" specified.\n", arg);
				errorno = -2;
				break;
			}

			const char *const value = argv[++index];

			if (is_infile)
				infilename = value;
			else if (is_outfile)
			{
				outfilename = value;
				outfile_specified = true;
			}
			else if (is_version)
				version = static_cast<int>(std::strtol(value, NULL, 10));
			else
				indir = value;
		}

		// Resolve the default name once, from the final version, so that a
		// later "-v" cannot leave a stale name chosen by an earlier one.
		if (!outfile_specified)
			outfilename = version < 2 ? "srell_ucfdata.hpp" : "srell_ucfdata2.hpp";
	}
};
// struct ucf_options
|
||||
|
||||
class unicode_casefolding
|
||||
{
|
||||
public:
|
||||
|
||||
unicode_casefolding()
|
||||
: maxdelta_(0L), maxdelta_cp_(0L), ucf_maxcodepoint_(0L), rev_maxcodepoint_(0L)
|
||||
, ucf_numofsegs_(1U), rev_numofsegs_(1U), numofcps_from_(0U), numofcps_to_(0U)
|
||||
, max_appearance_(0U), nextoffset_(0x100L), rev_charsets_(1, -1L)
|
||||
{
|
||||
}
|
||||
|
||||
int create_ucfdata(std::string &outdata, const ucf_options &opts)
|
||||
{
|
||||
const std::string indent("\t\t\t");
|
||||
int errorno = opts.errorno;
|
||||
std::string buf;
|
||||
|
||||
if (errorno)
|
||||
return errorno;
|
||||
|
||||
if (unishared::read_file(buf, opts.infilename, opts.indir))
|
||||
{
|
||||
static const srell::regex re_line("^.*$", srell::regex::multiline);
|
||||
const srell::cregex_iterator eos;
|
||||
srell::cregex_iterator iter(buf.c_str(), buf.c_str() + buf.size(), re_line);
|
||||
srell::cmatch match;
|
||||
int colcount = 0;
|
||||
|
||||
for (; iter != eos; ++iter)
|
||||
{
|
||||
if (iter->length(0))
|
||||
{
|
||||
static const srell::regex re_datainfo("^# (.*)$");
|
||||
|
||||
if (!srell::regex_match((*iter)[0].first, (*iter)[0].second, match, re_datainfo))
|
||||
{
|
||||
outdata.append(1, '\n');
|
||||
break;
|
||||
}
|
||||
outdata += "// " + match.str(1) + "\n";
|
||||
}
|
||||
}
|
||||
|
||||
if (opts.version <= 1)
|
||||
outdata += "template <typename T1, typename T2, typename T3>\nstruct unicode_casefolding\n{\n\tstatic const T1 *table()\n\t{\n\t\tstatic const T1 ucftable[] =\n\t\t{\n";
|
||||
else
|
||||
outdata += "template <typename T2, typename T3>\nstruct unicode_casefolding\n{\n";
|
||||
|
||||
for (; iter != eos; ++iter)
|
||||
{
|
||||
static const srell::regex re_cfdata("^\\s*([0-9A-Fa-f]+); ([CS]); ([0-9A-Fa-f]+);\\s*#\\s*(.*)$");
|
||||
const srell::cmatch &line = *iter;
|
||||
|
||||
if (srell::regex_match(line[0].first, line[0].second, match, re_cfdata))
|
||||
{
|
||||
const std::string from(match[1]);
|
||||
const std::string to(match[3]);
|
||||
const std::string type(match[2]);
|
||||
const std::string name(match[4]);
|
||||
|
||||
update(from, to);
|
||||
|
||||
if (opts.version == 1)
|
||||
outdata += indent + "{ 0x" + from + ", 0x" + to + " },\t// " + type + "; " + name + "\n";
|
||||
else if (opts.version <= 0)
|
||||
{
|
||||
if (colcount == 0)
|
||||
outdata += indent;
|
||||
outdata += "{ 0x" + from + ", 0x" + to + " },";
|
||||
if (++colcount == 4)
|
||||
{
|
||||
outdata.append(1, '\n');
|
||||
colcount = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (opts.version == 1)
|
||||
{
|
||||
static const srell::regex re_comment_or_emptyline("^#.*|^$");
|
||||
|
||||
if (!srell::regex_match(line[0].first, line[0].second, re_comment_or_emptyline))
|
||||
outdata += indent + "// " + line.str(0) + "\n";
|
||||
}
|
||||
}
|
||||
if (colcount > 0)
|
||||
outdata.append(1, '\n');
|
||||
if (opts.version <= 1)
|
||||
outdata += indent + "{ 0, 0 }\n\t\t};\n\t\treturn ucftable;\n\t}\n";
|
||||
|
||||
outdata += "\tstatic const T2 ucf_maxcodepoint = 0x" + unishared::stringify<16>(ucf_maxcodepoint_, "%.4lX") + ";\n";
|
||||
outdata += "\tstatic const T3 ucf_deltatablesize = 0x" + unishared::stringify<16>(ucf_numofsegs_ << 8, "%X") + ";\n";
|
||||
|
||||
outdata += "\tstatic const T2 rev_maxcodepoint = 0x" + unishared::stringify<16>(rev_maxcodepoint_, "%.4lX") + ";\n";
|
||||
outdata += "\tstatic const T3 rev_indextablesize = 0x" + unishared::stringify<16>(rev_numofsegs_ << 8, "%X") + ";\n";
|
||||
outdata += "\tstatic const T3 rev_charsettablesize = " + unishared::stringify<16>(numofcps_to_ * 2 + numofcps_from_ + 1, "%u") + ";\t// 1 + " + unishared::stringify<16>(numofcps_to_, "%u") + " * 2 + " + unishared::stringify<16>(numofcps_from_, "%u") + "\n";
|
||||
outdata += "\tstatic const T3 rev_maxset = " + unishared::stringify<16>(maxset(), "%u") + ";\n";
|
||||
outdata += "\tstatic const T2 eos = 0;\n";
|
||||
|
||||
if (opts.version >= 2)
|
||||
{
|
||||
outdata += "\n\tstatic const T2 ucf_deltatable[];\n\tstatic const T3 ucf_segmenttable[];\n\tstatic const T3 rev_indextable[];\n\tstatic const T3 rev_segmenttable[];\n\tstatic const T2 rev_charsettable[];\n\n\tstatic const T2 *ucf_deltatable_ptr()\n\t{\n\t\treturn ucf_deltatable;\n\t}\n\tstatic const T3 *ucf_segmenttable_ptr()\n\t{\n\t\treturn ucf_segmenttable;\n\t}\n\tstatic const T3 *rev_indextable_ptr()\n\t{\n\t\treturn rev_indextable;\n\t}\n\tstatic const T3 *rev_segmenttable_ptr()\n\t{\n\t\treturn rev_segmenttable;\n\t}\n\tstatic const T2 *rev_charsettable_ptr()\n\t{\n\t\treturn rev_charsettable;\n\t}\n};\n\n";
|
||||
out_v2tables(outdata);
|
||||
outdata += "#define SRELL_UCFDATA_VERSION 200\n";
|
||||
}
|
||||
else
|
||||
outdata += "};\n#define SRELL_UCFDATA_VER 201909L\n";
|
||||
|
||||
std::fprintf(stdout, "MaxDelta: %+ld (U+%.4lX->U+%.4lX)\n", maxdelta_, maxdelta_cp_, maxdelta_cp_ + maxdelta_);
|
||||
}
|
||||
else
|
||||
errorno = 1;
|
||||
|
||||
return errorno;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void update(const std::string &from, const std::string &to)
|
||||
{
|
||||
const long cp_from = std::strtol(from.c_str(), NULL, 16);
|
||||
const long cp_to = std::strtol(to.c_str(), NULL, 16);
|
||||
const long delta = cp_to - cp_from;
|
||||
const long segno_from = cp_from >> 8;
|
||||
const long segno_to = cp_to >> 8;
|
||||
|
||||
update_tables(cp_from, cp_to, segno_from);
|
||||
|
||||
++numofcps_from_;
|
||||
if (std::abs(maxdelta_) < std::abs(delta))
|
||||
{
|
||||
maxdelta_cp_ = cp_from;
|
||||
maxdelta_ = delta;
|
||||
}
|
||||
|
||||
if (ucf_maxcodepoint_ < cp_from)
|
||||
ucf_maxcodepoint_ = cp_from;
|
||||
|
||||
if (rev_maxcodepoint_ < cp_to)
|
||||
rev_maxcodepoint_ = cp_to;
|
||||
|
||||
if (rev_maxcodepoint_ < cp_from)
|
||||
rev_maxcodepoint_ = cp_from;
|
||||
|
||||
if (!ucf_countedsegnos.count(segno_from))
|
||||
{
|
||||
ucf_countedsegnos[segno_from] = 1;
|
||||
++ucf_numofsegs_;
|
||||
}
|
||||
|
||||
if (!rev_countedsegnos.count(segno_to))
|
||||
{
|
||||
rev_countedsegnos[segno_to] = 1;
|
||||
++rev_numofsegs_;
|
||||
}
|
||||
if (!rev_countedsegnos.count(segno_from))
|
||||
{
|
||||
rev_countedsegnos[segno_from] = 1;
|
||||
++rev_numofsegs_;
|
||||
}
|
||||
|
||||
if (!cps_counted_as_foldedto.count(cp_to))
|
||||
{
|
||||
cps_counted_as_foldedto[cp_to] = 1;
|
||||
++numofcps_to_;
|
||||
}
|
||||
|
||||
if (appearance_counts_.count(to))
|
||||
++appearance_counts_[to];
|
||||
else
|
||||
appearance_counts_[to] = 1;
|
||||
|
||||
if (max_appearance_ < appearance_counts_[to])
|
||||
max_appearance_ = appearance_counts_[to];
|
||||
}
|
||||
|
||||
unsigned int maxset() const
|
||||
{
|
||||
return max_appearance_ + 1;
|
||||
}
|
||||
|
||||
void out_v2tables(std::string &outdata)
|
||||
{
|
||||
const char *const headers[] = {
|
||||
"template <typename T2, typename T3>\nconst ",
|
||||
" unicode_casefolding<T2, T3>::",
|
||||
"[] =\n{\n"
|
||||
};
|
||||
|
||||
create_revtables();
|
||||
out_lowertable(outdata, headers, "T2", "ucf_deltatable", ucf_deltas_, ucf_segments_);
|
||||
outdata.append(1, '\n');
|
||||
out_uppertable(outdata, headers, "T3", "ucf_segmenttable", ucf_segments_);
|
||||
outdata.append(1, '\n');
|
||||
out_lowertable(outdata, headers, "T3", "rev_indextable", rev_indices_, rev_segments_);
|
||||
outdata.append(1, '\n');
|
||||
out_uppertable(outdata, headers, "T3", "rev_segmenttable", rev_segments_);
|
||||
outdata.append(1, '\n');
|
||||
out_cstable(outdata, headers, "T2", "rev_charsettable", rev_charsets_);
|
||||
}
|
||||
|
||||
// Updates ucf_segments_, ucf_deltas_, and rev_charsets_.
|
||||
void update_tables(const long cp_from, const long cp_to, const long segno_from)
|
||||
{
|
||||
if (segno_from >= static_cast<long>(ucf_segments_.size()))
|
||||
ucf_segments_.resize(segno_from + 1, 0L);
|
||||
|
||||
long &offset_of_segment = ucf_segments_[segno_from];
|
||||
|
||||
if (offset_of_segment == 0L)
|
||||
{
|
||||
offset_of_segment = nextoffset_;
|
||||
nextoffset_ += 0x100L;
|
||||
ucf_deltas_.resize(nextoffset_, 0L);
|
||||
}
|
||||
|
||||
ucf_deltas_[offset_of_segment + (cp_from & 0xffL)] = cp_to - cp_from;
|
||||
|
||||
for (long index = 0L;; ++index)
|
||||
{
|
||||
if (index == static_cast<long>(rev_charsets_.size()))
|
||||
{
|
||||
rev_charsets_.push_back(cp_to);
|
||||
rev_charsets_.push_back(cp_from);
|
||||
rev_charsets_.push_back(-1L);
|
||||
break;
|
||||
}
|
||||
if (rev_charsets_[index] == cp_to)
|
||||
{
|
||||
for (++index; rev_charsets_[index] != -1L; ++index);
|
||||
|
||||
rev_charsets_.insert(index, 1, cp_from);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Creates rev_segments_ and rev_indices_ from rev_charsets_.
|
||||
void create_revtables()
|
||||
{
|
||||
long nextoffset = 0x100L;
|
||||
for (long index = 0L; index < static_cast<long>(rev_charsets_.size()); ++index)
|
||||
{
|
||||
const long bocs = index; // Beginning of charset.
|
||||
|
||||
for (; rev_charsets_[index] != -1L; ++index)
|
||||
{
|
||||
const long &u21ch = rev_charsets_[index];
|
||||
const long segno = u21ch >> 8L;
|
||||
|
||||
if (segno >= static_cast<long>(rev_segments_.size()))
|
||||
rev_segments_.resize(segno + 1, 0L);
|
||||
|
||||
long &offset_of_segment = rev_segments_[segno];
|
||||
|
||||
if (offset_of_segment == 0L)
|
||||
{
|
||||
offset_of_segment = nextoffset;
|
||||
nextoffset += 0x100L;
|
||||
rev_indices_.resize(nextoffset, 0L);
|
||||
}
|
||||
rev_indices_[offset_of_segment + (u21ch & 0xffL)] = bocs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void out_lowertable(std::string &outdata, const char *const headers[], const char *const type, const char *const funcname, const std::basic_string<long> &table, const std::basic_string<long> &segtable) const
|
||||
{
|
||||
int end = static_cast<int>(table.size());
|
||||
|
||||
outdata += headers[0];
|
||||
outdata += type;
|
||||
outdata += headers[1];
|
||||
outdata += funcname;
|
||||
outdata += headers[2];
|
||||
|
||||
for (int i = 0; i < end;)
|
||||
{
|
||||
const int col = i & 15;
|
||||
|
||||
if ((i & 255) == 0)
|
||||
{
|
||||
if (i)
|
||||
{
|
||||
for (int j = 0; j < static_cast<int>(segtable.size()); ++j)
|
||||
{
|
||||
if (segtable[j] == i)
|
||||
{
|
||||
outdata += "\n\t// For u+" + unishared::stringify<16>(j, "%.2X") + "xx (" + unishared::stringify<16>(i, "%d") + ")\n";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
outdata += "\t// For common (0)\n";
|
||||
}
|
||||
|
||||
outdata += col == 0 ? "\t" : (col & 3) == 0 ? " " : " ";
|
||||
if (table[i] >= 0L)
|
||||
outdata += unishared::stringify<16>(table[i], "%ld");
|
||||
else
|
||||
outdata += "static_cast<", outdata += type, outdata += ">(", outdata += unishared::stringify<16>(table[i], "%ld") + ")";
|
||||
|
||||
if (++i == end)
|
||||
outdata.append(1, '\n');
|
||||
else if (col == 15)
|
||||
outdata += ",\n";
|
||||
else
|
||||
outdata.append(1, ',');
|
||||
}
|
||||
outdata += "};\n";
|
||||
}
|
||||
|
||||
void out_uppertable(std::string &outdata, const char *const headers[], const char *const type, const char *const funcname, const std::basic_string<long> &table) const
|
||||
{
|
||||
int end = static_cast<int>(table.size());
|
||||
|
||||
outdata += headers[0];
|
||||
outdata += type;
|
||||
outdata += headers[1];
|
||||
outdata += funcname;
|
||||
outdata += headers[2];
|
||||
|
||||
for (int i = 0; i < end;)
|
||||
{
|
||||
const int col = i & 15;
|
||||
|
||||
outdata += col == 0 ? "\t" : (col & 3) == 0 ? " " : " ";
|
||||
if (table[i] >= 0)
|
||||
outdata += unishared::stringify<16>(table[i], "%ld");
|
||||
else
|
||||
outdata += "static_cast<", outdata += type, outdata += ">(", outdata += unishared::stringify<16>(table[i], "%ld") + ")";
|
||||
|
||||
if (++i == end)
|
||||
outdata.append(1, '\n');
|
||||
else if (col == 15)
|
||||
outdata += ",\n";
|
||||
else
|
||||
outdata.append(1, ',');
|
||||
}
|
||||
outdata += "};\n";
|
||||
}
|
||||
|
||||
void out_cstable(std::string &outdata, const char *const headers[], const char *const type, const char *const funcname, const std::basic_string<long> &table) const
|
||||
{
|
||||
int end = static_cast<int>(table.size());
|
||||
bool newline = true;
|
||||
int bos = 0;
|
||||
int prevprintedbos = -1;
|
||||
|
||||
outdata += headers[0];
|
||||
outdata += type;
|
||||
outdata += headers[1];
|
||||
outdata += funcname;
|
||||
outdata += headers[2];
|
||||
|
||||
for (int i = 0; i < end;)
|
||||
{
|
||||
const long val = table[i];
|
||||
|
||||
outdata += newline ? "\t" : " ";
|
||||
newline = false;
|
||||
|
||||
if (val == -1L)
|
||||
outdata += "eos";
|
||||
else
|
||||
outdata += "0x", outdata += unishared::stringify<16>(val, "%.4lX");
|
||||
|
||||
if (++i != end)
|
||||
outdata.append(1, ',');
|
||||
|
||||
if (val == -1L)
|
||||
{
|
||||
if (prevprintedbos != bos / 10 || i == end)
|
||||
{
|
||||
outdata += "\t// ";
|
||||
outdata += unishared::stringify<16>(bos, "%d");
|
||||
prevprintedbos = bos / 10;
|
||||
}
|
||||
outdata.append(1, '\n');
|
||||
newline = true;
|
||||
bos = i;
|
||||
}
|
||||
}
|
||||
outdata += "};\n";
|
||||
}
|
||||
|
||||
typedef std::map<long, char> flagset_type;
|
||||
|
||||
long maxdelta_; // = 0L;
|
||||
long maxdelta_cp_; // = 0L;
|
||||
long ucf_maxcodepoint_; // = 0L; // The max code point for case-folding.
|
||||
long rev_maxcodepoint_; // = 0L; // The max code point for reverse lookup.
|
||||
unsigned int ucf_numofsegs_; // = 1U; // The number of segments in the delta table.
|
||||
unsigned int rev_numofsegs_; // = 1U; // The number of segments in the table for reverse lookup.
|
||||
unsigned int numofcps_from_; // = 0U; // The number of code points in "folded from"s.
|
||||
unsigned int numofcps_to_; // = 0U; // The number of code points in "folded to"s.
|
||||
|
||||
flagset_type ucf_countedsegnos; // The set of segment nos marked as "counted" for case-folding.
|
||||
flagset_type rev_countedsegnos; // The set of segment nos marked as "counted" for reverse lookup.
|
||||
flagset_type cps_counted_as_foldedto; // The set of code points marked as "folded to".
|
||||
|
||||
unsigned int max_appearance_;
|
||||
std::map<std::string, unsigned int> appearance_counts_;
|
||||
|
||||
long nextoffset_;
|
||||
std::basic_string<long> ucf_deltas_;
|
||||
std::basic_string<long> ucf_segments_;
|
||||
std::basic_string<long> rev_indices_;
|
||||
std::basic_string<long> rev_segments_;
|
||||
std::basic_string<long> rev_deltas_;
|
||||
std::basic_string<long> rev_charsets_;
|
||||
};
|
||||
// class unicode_casefolding
|
||||
|
||||
int main(const int argc, const char *const *const argv)
|
||||
{
|
||||
ucf_options ucfopts(argc, argv);
|
||||
std::string outdata;
|
||||
unicode_casefolding ucf;
|
||||
int errorno = ucf.create_ucfdata(outdata, ucfopts);
|
||||
|
||||
if (errorno == 0)
|
||||
{
|
||||
if (!unishared::write_file(ucfopts.outfilename, outdata))
|
||||
errorno = 2;
|
||||
}
|
||||
return errorno;
|
||||
}
|
||||
1066
lib/srell3_009/unicode/updataout.cpp
Normal file
1066
lib/srell3_009/unicode/updataout.cpp
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue