"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "test-framework/src/java/org/apache/lucene/analysis/standard/generateJavaUnicodeWordBreakTest.pl" between
lucene-7.6.0-src.tgz and lucene-7.7.0-src.tgz

About: Lucene is a Java full-text search engine (not a complete application, but rather a code library and API; java source code).

generateJavaUnicodeWordBreakTest.pl  (lucene-7.6.0-src.tgz):generateJavaUnicodeWordBreakTest.pl  (lucene-7.7.0-src.tgz)
skipping to change at line 43 skipping to change at line 43
my $url_prefix = "http://www.unicode.org/Public/${version}/ucd"; my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
my $scripts_url = "${url_prefix}/Scripts.txt"; my $scripts_url = "${url_prefix}/Scripts.txt";
my $line_break_url = "${url_prefix}/LineBreak.txt"; my $line_break_url = "${url_prefix}/LineBreak.txt";
my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt"; my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt"; my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
my $underscore_version = $version; my $underscore_version = $version;
$underscore_version =~ s/\./_/g; $underscore_version =~ s/\./_/g;
my $class_name = "WordBreakTestUnicode_${underscore_version}"; my $class_name = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java"; my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__"; my $header =<<"__HEADER__";
package org.apache.lucene.analysis;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with * contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with * (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at * the License. You may obtain a copy of the License at
* *
* http://www.apache.org/licenses/LICENSE-2.0 * http://www.apache.org/licenses/LICENSE-2.0
* *
* Unless required by applicable law or agreed to in writing, software * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, * distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.junit.Ignore; import org.junit.Ignore;
/** /**
* This class was automatically generated by ${script_name} * This class was automatically generated by ${script_name}
* from: ${url_prefix}/auxiliary/WordBreakTest.txt * from: ${url_prefix}/auxiliary/WordBreakTest.txt
* *
* WordBreakTest.txt indicates the points in the provided character sequences * WordBreakTest.txt indicates the points in the provided character sequences
* at which conforming implementations must and must not break words. This * at which conforming implementations must and must not break words. This
skipping to change at line 84 skipping to change at line 84
* sequences bounded by word breaks and containing at least one character * sequences bounded by word breaks and containing at least one character
* from one of the following character sets: * from one of the following character sets:
* *
* \\p{Script = Han} (From $scripts_url) * \\p{Script = Han} (From $scripts_url)
* \\p{Script = Hiragana} * \\p{Script = Hiragana}
* \\p{LineBreak = Complex_Context} (From $line_break_url) * \\p{LineBreak = Complex_Context} (From $line_break_url)
* \\p{WordBreak = ALetter} (From $word_break_url) * \\p{WordBreak = ALetter} (From $word_break_url)
* \\p{WordBreak = Hebrew_Letter} * \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana} * \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits) * \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits) * [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/ */
\@Ignore \@Ignore
public class ${class_name} extends BaseTokenStreamTestCase { public class ${class_name} extends BaseTokenStreamTestCase {
public void test(Analyzer analyzer) throws Exception { public void test(Analyzer analyzer) throws Exception {
__HEADER__ __HEADER__
my $codepoints = []; my $codepoints = [];
map { $codepoints->[$_] = 1 } (0xFF10..0xFF19); map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
my $regional_indicator_codepoints = [];
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt # Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case- # Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files. # insensitive comparison with the names in the Unicode data files.
parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1}); parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints, parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1}); {'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints, parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1,
'numeric' => 1}); 'numeric' => 1, 'e_base' => 1,
'e_modifier' => 1, 'glue_after_zwj' => 1, 'e_base_gaz' => 1});
parse_Unicode_data_file($word_break_url, $regional_indicator_codepoints, {'regional_indicator' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url); my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename); my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
open OUT, ">$output_path" open OUT, ">$output_path"
|| die "Error opening '$output_path' for writing: $!"; || die "Error opening '$output_path' for writing: $!";
print STDERR "Writing '$output_path'..."; print STDERR "Writing '$output_path'...";
print OUT $header; print OUT $header;
skipping to change at line 127 skipping to change at line 130
print OUT " // $line\n"; print OUT " // $line\n";
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character $sequence =~ s/\s*÷\s*$//; # Trim trailing break character
my $test_string = $sequence; my $test_string = $sequence;
$test_string =~ s/\s*÷\s*/\\u/g; $test_string =~ s/\s*÷\s*/\\u/g;
$test_string =~ s/\s*×\s*/\\u/g; $test_string =~ s/\s*×\s*/\\u/g;
$test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge; $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
$test_string =~ s/\\u000A/\\n/g; $test_string =~ s/\\u000A/\\n/g;
$test_string =~ s/\\u000D/\\r/g; $test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g; $test_string =~ s/\\u0022/\\\"/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character $sequence =~ s/^\s*÷\s*//; # Trim leading break character
# TODO: When upgrading JFlex to a version that supports Unicode 11.0+: remove the special case below for a Unicode 9.0 test data line that conflicts with TR#51 11.0 test data
# ÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
if ($sequence =~ /^200D\s*÷\s*261D$/) {
print OUT " // Skipping this test because it conflicts with TR#51 v11.0 rules.\n\n";
next;
}
my @tokens = (); my @tokens = ();
my $isfirst = 0;
for my $candidate (split /\s*÷\s*/, $sequence) { for my $candidate (split /\s*÷\s*/, $sequence) {
$isfirst = 1;
my @chars = (); my @chars = ();
my $has_wanted_char = 0; my $has_wanted_chars = 0;
my $prev_char_regional_indicator = 0;
while ($candidate =~ /([0-9A-F]+)/gi) { while ($candidate =~ /([0-9A-F]+)/gi) {
my $hexchar = $1; my $hexchar = $1;
if (4 == length($hexchar)) { if (4 == length($hexchar)) {
push @chars, $hexchar; push @chars, $hexchar;
} else { } else {
push @chars, above_BMP_char_to_surrogates($hexchar); push @chars, above_BMP_char_to_surrogates($hexchar);
} }
unless ($has_wanted_char) { unless ($has_wanted_chars) {
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)])); my $codepoint = hex($hexchar);
if (defined($codepoints->[$codepoint])) {
$has_wanted_chars = 1;
} elsif (defined($regional_indicator_codepoints->[$codepoint])) {
if (1 == $prev_char_regional_indicator) {
$has_wanted_chars = 1; # must be 2 regional indicators in a row
} else {
$prev_char_regional_indicator = 1;
}
}
} }
} }
if ($has_wanted_char) { if ($has_wanted_chars) {
push @tokens, '"'.join('', map { "\\u$_" } @chars).'"'; push @tokens, '"'.join('', map { $_ eq "0022" ? "\\\"" : "\\u$_" } @chars).'"';
} }
} }
print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n"; print OUT " assertAnalyzesTo(analyzer, \"${test_string}\",\n";
print OUT " new String[] { "; print OUT " new String[] { ";
print OUT join(", ", @tokens), " });\n\n"; print OUT join(", ", @tokens), " });\n\n";
} }
print OUT " }\n}\n"; print OUT " }\n}\n";
close OUT; close OUT;
print STDERR "done.\n"; print STDERR "done.\n";
 End of changes. 11 change blocks. 
10 lines changed or deleted 40 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)