"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "core/src/java/org/apache/lucene/util/bkd/BKDWriter.java" between
lucene-7.6.0-src.tgz and lucene-7.7.0-src.tgz

About: Lucene is a Java full-text search engine, provided not as a complete application but as a code library and API (Java source code).

BKDWriter.java (lucene-7.6.0-src.tgz) vs. BKDWriter.java (lucene-7.7.0-src.tgz)

Unified diff: lines prefixed with "-" appear only in lucene-7.6.0, lines prefixed with "+" only in lucene-7.7.0; unprefixed lines are unchanged context.
skipping to change at line 683 (7.6.0) / line 683 (7.7.0):

       valueCount += leafCount;

       if (leafBlockFPs.size() > 0) {
         // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end:
         leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength));
       }
       leafBlockFPs.add(out.getFilePointer());
       checkMaxLeafNodeCount(leafBlockFPs.size());

       // Find per-dim common prefix:
-      int prefix = bytesPerDim;
       int offset = (leafCount - 1) * packedBytesLength;
-      for(int j=0;j<bytesPerDim;j++) {
-        if (leafValues[j] != leafValues[offset+j]) {
-          prefix = j;
-          break;
-        }
-      }
+      int prefix = FutureArrays.mismatch(leafValues, 0, bytesPerDim, leafValues, offset, offset + bytesPerDim);
+      if (prefix == -1) {
+        prefix = bytesPerDim;
+      }
       commonPrefixLengths[0] = prefix;

       assert scratchOut.getPosition() == 0;
       writeLeafBlockDocs(scratchOut, leafDocs, 0, leafCount);
       writeCommonPrefixes(scratchOut, commonPrefixLengths, leafValues);
       scratchBytesRef1.length = packedBytesLength;
       scratchBytesRef1.bytes = leafValues;
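Most hunks in this diff apply the same refactoring: a hand-rolled byte-comparison loop is replaced by FutureArrays.mismatch, Lucene's backport of Java 9's java.util.Arrays.mismatch. The method returns the index of the first byte that differs between two ranges, or -1 when they are fully equal, which is why every call site maps -1 back to bytesPerDim. A minimal, self-contained sketch of the idiom, using java.util.Arrays.mismatch directly (Java 9+):

    import java.util.Arrays;

    public class MismatchPrefixDemo {
      // Common-prefix length of two equal-length byte ranges, the way the
      // BKDWriter call sites derive commonPrefixLengths from mismatch():
      static int commonPrefixLength(byte[] a, int aFrom, byte[] b, int bFrom, int len) {
        int prefix = Arrays.mismatch(a, aFrom, aFrom + len, b, bFrom, bFrom + len);
        return prefix == -1 ? len : prefix; // -1 means the ranges are identical
      }

      public static void main(String[] args) {
        byte[] first = {1, 2, 3, 4};
        byte[] last  = {1, 2, 9, 9};
        System.out.println(commonPrefixLength(first, 0, last, 0, 4)); // prints 2
      }
    }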
skipping to change at line 770 (7.6.0) / line 767 (7.7.0):

       }
     } else {
       assert count == 0;
     }
   }

   // TODO: if we fixed each partition step to just record the file offset at the "split point", we could probably handle variable length
   // encoding and not have our own ByteSequencesReader/Writer

   /** Sort the heap writer by the specified dim */
-  private void sortHeapPointWriter(final HeapPointWriter writer, int dim) {
-    final int pointCount = Math.toIntExact(this.pointCount);
+  private void sortHeapPointWriter(final HeapPointWriter writer, int pointCount, int dim) {
     // Tie-break by docID:

     // No need to tie break on ord, for the case where the same doc has the same value in a given dimension indexed more than once: it
     // can't matter at search time since we don't write ords into the index:
     new MSBRadixSorter(bytesPerDim + Integer.BYTES) {

       @Override
       protected int byteAt(int i, int k) {
         assert k >= 0;
         if (k < bytesPerDim) {
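The MSBRadixSorter above sorts keys of bytesPerDim + Integer.BYTES bytes: the value's bytes in the dimension being sorted, followed by the docID's four bytes most-significant first, so the docID tie-break falls out of the byte order itself. A standalone sketch of that key layout (the flat values/docIDs arrays here are hypothetical stand-ins for the heap writer's state, not the Lucene API):

    public class RadixKeyDemo {
      // byteAt over a conceptual sort key: bytesPerDim value bytes, then 4 docID bytes.
      static int byteAt(byte[] values, int[] docIDs, int bytesPerDim, int i, int k) {
        if (k < bytesPerDim) {
          // first part of the key: the packed value's bytes for the sorted dim
          return values[i * bytesPerDim + k] & 0xff;
        } else {
          // tail of the key: docID bytes, most significant first, so equal
          // values end up ordered by ascending docID
          int shift = 8 * (3 - (k - bytesPerDim)); // 3 == Integer.BYTES - 1
          return (docIDs[i] >>> shift) & 0xff;
        }
      }

      public static void main(String[] args) {
        byte[] values = {7, 7};   // two points, 1 byte per dim, equal values
        int[] docIDs = {258, 5};  // 258 == 0x00000102
        // k=0 compares value bytes (equal); k=1..4 compare docID bytes:
        System.out.println(byteAt(values, docIDs, 1, 0, 4)); // prints 2 (low byte of 258)
      }
    }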
skipping to change at line 845 (7.6.0) / line 841 (7.7.0):

         System.out.println("    " + count + ": " + new BytesRef(v, dim*bytesPerDim, bytesPerDim));
         count++;
         if (count == slice.count) {
           break;
         }
       }
     }
   }
   */

+  //return a new point writer sort by the provided dimension from input data
   private PointWriter sort(int dim) throws IOException {
     assert dim >= 0 && dim < numDataDims;

     if (heapPointWriter != null) {
       assert tempInput == null;

       // We never spilled the incoming points to disk, so now we sort in heap:
-      HeapPointWriter sorted;
-      if (dim == 0) {
-        // First dim can re-use the current heap writer
-        sorted = heapPointWriter;
-      } else {
-        // Subsequent dims need a private copy
-        sorted = new HeapPointWriter((int) pointCount, (int) pointCount, packedBytesLength, longOrds, singleValuePerDoc);
-        sorted.copyFrom(heapPointWriter);
-      }
+      HeapPointWriter sorted = heapPointWriter;

       //long t0 = System.nanoTime();
-      sortHeapPointWriter(sorted, dim);
+      sortHeapPointWriter(sorted, Math.toIntExact(this.pointCount), dim);
       //long t1 = System.nanoTime();
       //System.out.println("BKD: sort took " + ((t1-t0)/1000000.0) + " msec");

       sorted.close();
+      heapPointWriter = null;
       return sorted;
     } else {

       // Offline sort:
       assert tempInput != null;
+      OfflinePointWriter sorted = sortOffLine(dim, tempInput.getName(), 0, pointCount);
+      tempDir.deleteFile(tempInput.getName());
+      tempInput = null;
+      return sorted;
+    }
+  }
+
+  //return a new point writer sort by the provided dimension from start to start + pointCount
+  private PointWriter sort(int dim, PointWriter writer, final long start, final long pointCount) throws IOException {
+    assert dim >= 0 && dim < numDataDims;
+    if (writer instanceof HeapPointWriter) {
+      HeapPointWriter heapPointWriter = createHeapPointWriterCopy((HeapPointWriter) writer, start, pointCount);
+      sortHeapPointWriter(heapPointWriter, Math.toIntExact(pointCount), dim);
+      return heapPointWriter;
+    } else {
+      OfflinePointWriter offlinePointWriter = (OfflinePointWriter) writer;
+      return sortOffLine(dim, offlinePointWriter.name, start, pointCount);
+    }
+  }
+
+  // sort a given file on a given dimension for start to start + point count
+  private OfflinePointWriter sortOffLine(int dim, String inputName, final long start, final long pointCount) throws IOException {
     final int offset = bytesPerDim * dim;

     Comparator<BytesRef> cmp;
     if (dim == numDataDims - 1) {
       // in that case the bytes for the dimension and for the doc id are contiguous,
       // so we don't need a branch
       cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {
         @Override
         protected int byteAt(BytesRef ref, int i) {
           return ref.bytes[ref.offset + offset + i] & 0xff;
         }
       };
     } else {
       cmp = new BytesRefComparator(bytesPerDim + Integer.BYTES) {
         @Override
         protected int byteAt(BytesRef ref, int i) {
           if (i < bytesPerDim) {
             return ref.bytes[ref.offset + offset + i] & 0xff;
           } else {
             return ref.bytes[ref.offset + packedBytesLength + i - bytesPerDim] & 0xff;
           }
         }
       };
     }

     OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc, null, 0) {

       /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
       @Override
       protected ByteSequencesWriter getWriter(IndexOutput out, long count) {
         return new ByteSequencesWriter(out) {
           @Override
           public void write(byte[] bytes, int off, int len) throws IOException {
             assert len == bytesPerDoc: "len=" + len + " bytesPerDoc=" + bytesPerDoc;
             out.writeBytes(bytes, off, len);
           }
         };
       }

       /** We write/read fixed-byte-width file that {@link OfflinePointReader} can read. */
       @Override
       protected ByteSequencesReader getReader(ChecksumIndexInput in, String name) throws IOException {
+        //This allows to read only a subset of the original file
+        long startPointer = (name.equals(inputName)) ? bytesPerDoc * start : in.getFilePointer();
+        long endPointer = (name.equals(inputName)) ? startPointer + bytesPerDoc * pointCount : Long.MAX_VALUE;
+        in.seek(startPointer);
         return new ByteSequencesReader(in, name) {
           final BytesRef scratch = new BytesRef(new byte[bytesPerDoc]);
           @Override
           public BytesRef next() throws IOException {
             if (in.getFilePointer() >= end) {
               return null;
+            } else if (in.getFilePointer() >= endPointer) {
+              in.seek(end);
+              return null;
             }
             in.readBytes(scratch.bytes, 0, bytesPerDoc);
             return scratch;
           }
         };
       }
     };

-    String name = sorter.sort(tempInput.getName());
+    String name = sorter.sort(inputName);
     return new OfflinePointWriter(tempDir, name, packedBytesLength, pointCount, longOrds, singleValuePerDoc);
   }

+  private HeapPointWriter createHeapPointWriterCopy(HeapPointWriter writer, long start, long count) throws IOException {
+    //TODO: Can we do this faster?
+    int size = Math.toIntExact(count);
+    try (HeapPointWriter pointWriter = new HeapPointWriter(size, size, packedBytesLength, longOrds, singleValuePerDoc);
+         PointReader reader = writer.getReader(start, count)) {
+      for (long i = 0; i < count; i++) {
+        reader.next();
+        pointWriter.append(reader.packedValue(), reader.ord(), reader.docID());
+      }
+      return pointWriter;
+    } catch (Throwable t) {
+      throw verifyChecksum(t, writer);
+    }
+  }

   private void checkMaxLeafNodeCount(int numLeaves) {
     if ((1+bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH) {
       throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
     }
   }

   /** Writes the BKD tree to the provided {@link IndexOutput} and returns the file offset where index was written. */
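The new getReader override is what lets sortOffLine sort just a slice of a shared temp file: every record is exactly bytesPerDoc bytes, so the slice starting at point start begins at byte bytesPerDoc * start and ends bytesPerDoc * pointCount bytes later. A self-contained sketch of that arithmetic over a plain byte array (hypothetical helper, not the Lucene reader):

    import java.util.Arrays;

    // Sketch: the offset arithmetic behind the subset-reading getReader override.
    public class FixedWidthSlice {
      static byte[][] readSlice(byte[] file, int bytesPerDoc, long start, long count) {
        long startPointer = bytesPerDoc * start;              // seek target, as in in.seek(startPointer)
        long endPointer = startPointer + bytesPerDoc * count; // reads at/past this would return null
        byte[][] records = new byte[(int) count][];
        for (int i = 0; i < count; i++) {
          int from = (int) (startPointer + (long) i * bytesPerDoc);
          records[i] = Arrays.copyOfRange(file, from, from + bytesPerDoc);
        }
        return records;
      }

      public static void main(String[] args) {
        byte[] file = {0, 1, 2, 3, 4, 5, 6, 7}; // four 2-byte records
        byte[][] slice = readSlice(file, 2, 1, 2);
        System.out.println(Arrays.deepToString(slice)); // [[2, 3], [4, 5]]
      }
    }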
skipping to change at line 964 (7.6.0) / line 995 (7.7.0):

     if (offlinePointWriter != null) {
       offlinePointWriter.close();
     }

     if (pointCount == 0) {
       throw new IllegalStateException("must index at least one point");
     }

     LongBitSet ordBitSet;
-    if (numDataDims > 1) {
+    if (numIndexDims > 1) {
       if (singleValuePerDoc) {
         ordBitSet = new LongBitSet(maxDoc);
       } else {
         ordBitSet = new LongBitSet(pointCount);
       }
     } else {
       ordBitSet = null;
     }

     long countPerLeaf = pointCount;
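ordBitSet later marks which point ordinals land in the right subtree while partitioning the non-split dimensions, so its size follows from what an ordinal identifies: with singleValuePerDoc each doc contributes at most one point and a bit per docID (maxDoc) suffices; otherwise every point ordinal needs its own bit (pointCount); and the 1D case does no cross-dimension partitioning at all, hence null. A toy illustration of that sizing decision (all numbers made up):

    public class OrdBitSetSizing {
      public static void main(String[] args) {
        int numIndexDims = 2;
        boolean singleValuePerDoc = false;
        int maxDoc = 1_000_000;      // docs in the segment
        long pointCount = 3_500_000; // total points (several per doc here)

        long bits;
        if (numIndexDims > 1) {
          // one bit per doc iff each doc has at most one point, else one per point:
          bits = singleValuePerDoc ? maxDoc : pointCount;
        } else {
          bits = 0; // the 1D path never marks ords, hence ordBitSet = null
        }
        System.out.println("bits needed: " + bits); // 3500000
      }
    }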
skipping to change at line 998 (7.6.0) / line 1029 (7.7.0):

     // Indexed by nodeID, but first (root) nodeID is 1.  We do 1+ because the lead byte at each recursion says which dim we split on.
     byte[] splitPackedValues = new byte[Math.toIntExact(numLeaves*(1+bytesPerDim))];

     // +1 because leaf count is power of 2 (e.g. 8), and innerNodeCount is power of 2 minus 1 (e.g. 7)
     long[] leafBlockFPs = new long[numLeaves];

     // Make sure the math above "worked":
     assert pointCount / numLeaves <= maxPointsInLeafNode: "pointCount=" + pointCount + " numLeaves=" + numLeaves + " maxPointsInLeafNode=" + maxPointsInLeafNode;

-    // Sort all docs once by each dimension:
-    PathSlice[] sortedPointWriters = new PathSlice[numDataDims];
+    // Slices are created as they are needed
+    PathSlice[] sortedPointWriters = new PathSlice[numIndexDims];

     // This is only used on exception; on normal code paths we close all files we opened:
     List<Closeable> toCloseHeroically = new ArrayList<>();

     boolean success = false;
     try {

-      //long t0 = System.nanoTime();
-      // even with selective indexing we create the sortedPointWriters so we can compress
-      // the leaf node data by common prefix
-      for(int dim=0;dim<numDataDims;dim++) {
-        sortedPointWriters[dim] = new PathSlice(sort(dim), 0, pointCount);
-      }
-      //long t1 = System.nanoTime();
-      //System.out.println("sort time: " + ((t1-t0)/1000000.0) + " msec");
-
-      if (tempInput != null) {
-        tempDir.deleteFile(tempInput.getName());
-        tempInput = null;
-      } else {
-        assert heapPointWriter != null;
-        heapPointWriter = null;
-      }
-
       final int[] parentSplits = new int[numIndexDims];
       build(1, numLeaves, sortedPointWriters,
             ordBitSet, out,
             minPackedValue, maxPackedValue,
             parentSplits,
             splitPackedValues,
             leafBlockFPs,
             toCloseHeroically);
       assert Arrays.equals(parentSplits, new int[numIndexDims]);

-      for(PathSlice slice : sortedPointWriters) {
-        slice.writer.destroy();
-      }
-
       // If no exception, we should have cleaned everything up:
       assert tempDir.getCreatedFiles().isEmpty();

       //long t2 = System.nanoTime();
       //System.out.println("write time: " + ((t2-t1)/1000000.0) + " msec");

       success = true;
     } finally {
       if (success == false) {
         IOUtils.deleteFilesIgnoringExceptions(tempDir, tempDir.getCreatedFiles());
         IOUtils.closeWhileHandlingException(toCloseHeroically);
skipping to change at line 1159 (7.6.0) / line 1170 (7.7.0):

       // The left tree's left most leaf block FP is always the minimal FP:
       leftBlockFP = minBlockFP;
     }

     int address = nodeID * (1+bytesPerDim);
     int splitDim = splitPackedValues[address++] & 0xff;

     //System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));

     // find common prefix with last split value in this dim:
-    int prefix = 0;
-    for(;prefix<bytesPerDim;prefix++) {
-      if (splitPackedValues[address+prefix] != lastSplitValues[splitDim * bytesPerDim + prefix]) {
-        break;
-      }
-    }
+    int prefix = FutureArrays.mismatch(splitPackedValues, address, address + bytesPerDim, lastSplitValues,
+        splitDim * bytesPerDim, splitDim * bytesPerDim + bytesPerDim);
+    if (prefix == -1) {
+      prefix = bytesPerDim;
+    }

     //System.out.println("writeNodeData nodeID=" + nodeID + " splitDim=" + splitDim + " numDims=" + numDims + " bytesPerDim=" + bytesPerDim + " prefix=" + prefix);

     int firstDiffByteDelta;
     if (prefix < bytesPerDim) {
       //System.out.println("  delta byte cur=" + Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + Integer.toHexString(lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF) + " negated?=" + negativeDeltas[splitDim]);
       firstDiffByteDelta = (splitPackedValues[address+prefix]&0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF);
       if (negativeDeltas[splitDim]) {
         firstDiffByteDelta = -firstDiffByteDelta;
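The packed index stores each split value as a common-prefix length against the previous split value in the same dimension, plus the delta of the first differing byte and the remaining suffix bytes. Since split values shrink while descending left subtrees, negating the delta there (negativeDeltas[splitDim]) keeps it non-negative and cheaper to encode. A worked sketch with made-up byte values:

    public class SplitDeltaDemo {
      public static void main(String[] args) {
        byte[] prev = {0x10, 0x20, 0x30, 0x40}; // last split value written for this dim
        byte[] cur  = {0x10, 0x20, 0x35, 0x00}; // split value being written now
        int prefix = 2;                          // cur and prev share bytes 0..1
        int firstDiffByteDelta = (cur[prefix] & 0xFF) - (prev[prefix] & 0xFF); // 0x35 - 0x30 = 5
        boolean negativeDelta = false; // stands in for negativeDeltas[splitDim]
        if (negativeDelta) {           // in left subtrees values shrink, so negate to stay >= 0
          firstDiffByteDelta = -firstDiffByteDelta;
        }
        // The writer then encodes prefix, the delta, and the suffix bytes cur[prefix+1..]:
        System.out.println("prefix=" + prefix + " delta=" + firstDiffByteDelta); // prefix=2 delta=5
      }
    }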
skipping to change at line 1450 (7.6.0) / line 1460 (7.7.0):

     // Now we mark ords that fall into the right half, so we can partition on all other dims that are not the split dim:

     // Read the split value, then mark all ords in the right tree (larger than the split value):

     // TODO: find a way to also checksum this reader?  If we changed to markLeftTree, and scanned the final chunk, it could work?
     try (PointReader reader = source.writer.getReader(source.start + source.count - rightCount, rightCount)) {
       boolean result = reader.next();
       assert result: "rightCount=" + rightCount + " source.count=" + source.count + " source.writer=" + source.writer;
       System.arraycopy(reader.packedValue(), splitDim*bytesPerDim, scratch1, 0, bytesPerDim);
-      if (numDataDims > 1) {
+      if (numIndexDims > 1 && ordBitSet != null) {
         assert ordBitSet.get(reader.ord()) == false;
         ordBitSet.set(reader.ord());

         // Subtract 1 from rightCount because we already did the first value above (so we could record the split value):
         reader.markOrds(rightCount-1, ordBitSet);
       }
     } catch (Throwable t) {
       throw verifyChecksum(t, source.writer);
     }

     return scratch1;
skipping to change at line 1555 (7.6.0) / line 1565 (7.7.0):

     final int count = to - from;
     assert count <= maxPointsInLeafNode;

     // Compute common prefixes
     Arrays.fill(commonPrefixLengths, bytesPerDim);
     reader.getValue(from, scratchBytesRef1);
     for (int i = from + 1; i < to; ++i) {
       reader.getValue(i, scratchBytesRef2);
       for (int dim=0;dim<numDataDims;dim++) {
         final int offset = dim * bytesPerDim;
-        for(int j=0;j<commonPrefixLengths[dim];j++) {
-          if (scratchBytesRef1.bytes[scratchBytesRef1.offset+offset+j] != scratchBytesRef2.bytes[scratchBytesRef2.offset+offset+j]) {
-            commonPrefixLengths[dim] = j;
-            break;
-          }
-        }
+        int dimensionPrefixLength = commonPrefixLengths[dim];
+        commonPrefixLengths[dim] = FutureArrays.mismatch(scratchBytesRef1.bytes, scratchBytesRef1.offset + offset,
+            scratchBytesRef1.offset + offset + dimensionPrefixLength,
+            scratchBytesRef2.bytes, scratchBytesRef2.offset + offset,
+            scratchBytesRef2.offset + offset + dimensionPrefixLength);
+        if (commonPrefixLengths[dim] == -1) {
+          commonPrefixLengths[dim] = dimensionPrefixLength;
+        }
       }
     }

     // Find the dimension that has the least number of unique bytes at commonPrefixLengths[dim]
     FixedBitSet[] usedBytes = new FixedBitSet[numDataDims];
     for (int dim = 0; dim < numDataDims; ++dim) {
       if (commonPrefixLengths[dim] < bytesPerDim) {
         usedBytes[dim] = new FixedBitSet(256);
       }
skipping to change at line 1635 (7.6.0) / line 1647 (7.7.0):

       out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
       scratchOut.reset();
     } else {
       // inner node

       // compute the split dimension and partition around it
       final int splitDim = split(minPackedValue, maxPackedValue, parentSplits);
       final int mid = (from + to + 1) >>> 1;

-      int commonPrefixLen = bytesPerDim;
-      for (int i = 0; i < bytesPerDim; ++i) {
-        if (minPackedValue[splitDim * bytesPerDim + i] != maxPackedValue[splitDim * bytesPerDim + i]) {
-          commonPrefixLen = i;
-          break;
-        }
-      }
+      int commonPrefixLen = FutureArrays.mismatch(minPackedValue, splitDim * bytesPerDim,
+          splitDim * bytesPerDim + bytesPerDim, maxPackedValue, splitDim * bytesPerDim,
+          splitDim * bytesPerDim + bytesPerDim);
+      if (commonPrefixLen == -1) {
+        commonPrefixLen = bytesPerDim;
+      }

       MutablePointsReaderUtils.partition(maxDoc, splitDim, bytesPerDim, commonPrefixLen,
           reader, from, to, mid, scratchBytesRef1, scratchBytesRef2);

       // set the split value
       final int address = nodeID * (1+bytesPerDim);
       splitPackedValues[address] = (byte) splitDim;
       reader.getValue(mid, scratchBytesRef1);
       System.arraycopy(scratchBytesRef1.bytes, scratchBytesRef1.offset + splitDim * bytesPerDim, splitPackedValues, address + 1, bytesPerDim);
skipping to change at line 1683 (7.6.0) / line 1694 (7.7.0):

   private void build(int nodeID, int leafNodeOffset,
                      PathSlice[] slices,
                      LongBitSet ordBitSet,
                      IndexOutput out,
                      byte[] minPackedValue, byte[] maxPackedValue,
                      int[] parentSplits,
                      byte[] splitPackedValues,
                      long[] leafBlockFPs,
                      List<Closeable> toCloseHeroically) throws IOException {

-    for(PathSlice slice : slices) {
-      assert slice.count == slices[0].count;
-    }
-
-    if (numDataDims == 1 && slices[0].writer instanceof OfflinePointWriter && slices[0].count <= maxPointsSortInHeap) {
-      // Special case for 1D, to cutover to heap once we recurse deeply enough:
-      slices[0] = switchToHeap(slices[0], toCloseHeroically);
-    }
-
     if (nodeID >= leafNodeOffset) {

       // Leaf node: write block
       // We can write the block in any order so by default we write it sorted by the dimension that has the
       // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient
       int sortedDim = 0;
       int sortedDimCardinality = Integer.MAX_VALUE;

-      for (int dim=0;dim<numDataDims;dim++) {
+      for (int dim=0;dim<numIndexDims;dim++) {
+        //create a slice if it does not exist
+        boolean created = false;
+        if (slices[dim] == null) {
+          createPathSlice(slices, dim);
+          created = true;
+        }
         if (slices[dim].writer instanceof HeapPointWriter == false) {
           // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
           // offline, but then kept splitting only in one dimension, and so never had to rewrite into heap writer
+          PathSlice slice = slices[dim];
           slices[dim] = switchToHeap(slices[dim], toCloseHeroically);
+          if (created) {
+            slice.writer.destroy();
+          }
         }

         PathSlice source = slices[dim];
         HeapPointWriter heapSource = (HeapPointWriter) source.writer;

         // Find common prefix by comparing first and last values, already sorted in this dimension:
         heapSource.readPackedValue(Math.toIntExact(source.start), scratch1);
         heapSource.readPackedValue(Math.toIntExact(source.start + source.count - 1), scratch2);

         int offset = dim * bytesPerDim;
-        commonPrefixLengths[dim] = bytesPerDim;
-        for(int j=0;j<bytesPerDim;j++) {
-          if (scratch1[offset+j] != scratch2[offset+j]) {
-            commonPrefixLengths[dim] = j;
-            break;
-          }
-        }
+        commonPrefixLengths[dim] = FutureArrays.mismatch(scratch1, offset, offset + bytesPerDim, scratch2, offset, offset + bytesPerDim);
+        if (commonPrefixLengths[dim] == -1) {
+          commonPrefixLengths[dim] = bytesPerDim;
+        }

         int prefix = commonPrefixLengths[dim];
         if (prefix < bytesPerDim) {
           int cardinality = 1;
           byte previous = scratch1[offset + prefix];
           for (long i = 1; i < source.count; ++i) {
             heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2);
             byte b = scratch2[offset + prefix];
             assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
skipping to change at line 1745 (7.6.0) / line 1754 (7.7.0):

             }
           }
           assert cardinality <= 256;
           if (cardinality < sortedDimCardinality) {
             sortedDim = dim;
             sortedDimCardinality = cardinality;
           }
         }
       }

-      PathSlice source = slices[sortedDim];
+      PathSlice dataDimPathSlice = null;
+
+      if (numDataDims != numIndexDims) {
+        HeapPointWriter heapSource = (HeapPointWriter) slices[0].writer;
+        int from = (int) slices[0].start;
+        int to = from + (int) slices[0].count;
+        Arrays.fill(commonPrefixLengths, numIndexDims, numDataDims, bytesPerDim);
+        heapSource.readPackedValue(from, scratch1);
+        for (int i = from + 1; i < to; ++i) {
+          heapSource.readPackedValue(i, scratch2);
+          for (int dim = numIndexDims; dim < numDataDims; dim++) {
+            final int offset = dim * bytesPerDim;
+            int dimensionPrefixLength = commonPrefixLengths[dim];
+            commonPrefixLengths[dim] = FutureArrays.mismatch(scratch1, offset, offset + dimensionPrefixLength,
+                scratch2, offset, offset + dimensionPrefixLength);
+            if (commonPrefixLengths[dim] == -1) {
+              commonPrefixLengths[dim] = dimensionPrefixLength;
+            }
+          }
+        }
+        //handle case when all index dimensions contain the same value but not the data dimensions
+        if (commonPrefixLengths[sortedDim] == bytesPerDim) {
+          for (int dim = numIndexDims; dim < numDataDims; ++dim) {
+            if (commonPrefixLengths[dim] != bytesPerDim) {
+              sortedDim = dim;
+              //create a new slice in memory
+              dataDimPathSlice = switchToHeap(slices[0], toCloseHeroically);
+              sortHeapPointWriter((HeapPointWriter) dataDimPathSlice.writer, (int) dataDimPathSlice.count, sortedDim);
+              break;
+            }
+          }
+        }
+      }
+
+      PathSlice source = (dataDimPathSlice != null) ? dataDimPathSlice : slices[sortedDim];

       // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
       HeapPointWriter heapSource = (HeapPointWriter) source.writer;

       // Save the block file pointer:
       leafBlockFPs[nodeID - leafNodeOffset] = out.getFilePointer();
       //System.out.println("  write leaf block @ fp=" + out.getFilePointer());

       // Write docIDs first, as their own chunk, so that at intersect time we can add all docIDs w/o
       // loading the values:
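In the leaf-writing path above, the block is stored sorted by whichever dimension has the fewest distinct byte values immediately after its common prefix, because low cardinality at that position lets the leaf's prefix compression do the most work. A standalone sketch of that selection, with plain arrays standing in for HeapPointWriter:

    public class LeafSortDimDemo {
      // Pick the dim whose first byte past its common prefix takes the fewest
      // distinct values across the leaf's points.
      static int pickSortedDim(byte[][] packedValues, int numDims, int bytesPerDim, int[] commonPrefixLengths) {
        int sortedDim = 0;
        int sortedDimCardinality = Integer.MAX_VALUE;
        for (int dim = 0; dim < numDims; dim++) {
          int prefix = commonPrefixLengths[dim];
          if (prefix >= bytesPerDim) {
            continue; // every byte equal in this dim; nothing left to distinguish
          }
          boolean[] seen = new boolean[256]; // the real code uses FixedBitSet(256)
          int cardinality = 0;
          for (byte[] value : packedValues) {
            int b = value[dim * bytesPerDim + prefix] & 0xff;
            if (!seen[b]) {
              seen[b] = true;
              cardinality++;
            }
          }
          if (cardinality < sortedDimCardinality) {
            sortedDim = dim;
            sortedDimCardinality = cardinality;
          }
        }
        return sortedDim;
      }

      public static void main(String[] args) {
        // 2 dims, 1 byte each; dim 0 has 3 distinct bytes, dim 1 has 2:
        byte[][] points = {{1, 9}, {2, 9}, {3, 8}};
        System.out.println(pickSortedDim(points, 2, 1, new int[]{0, 0})); // prints 1
      }
    }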
skipping to change at line 1794 (7.6.0) / line 1837 (7.7.0):

     } else {
       // Inner node: partition/recurse

       int splitDim;
       if (numIndexDims > 1) {
         splitDim = split(minPackedValue, maxPackedValue, parentSplits);
       } else {
         splitDim = 0;
       }

+      //We delete the created path slices at the same level
+      boolean deleteSplitDim = false;
+      if (slices[splitDim] == null) {
+        createPathSlice(slices, splitDim);
+        deleteSplitDim = true;
+      }
+
       PathSlice source = slices[splitDim];

       assert nodeID < splitPackedValues.length: "nodeID=" + nodeID + " splitValues.length=" + splitPackedValues.length;

       // How many points will be in the left tree:
       long rightCount = source.count / 2;
       long leftCount = source.count - rightCount;

-      byte[] splitValue = markRightTree(rightCount, splitDim, source, ordBitSet);
+      // When we are on this dim, below, we clear the ordBitSet:
+      int dimToClear = numIndexDims - 1;
+      while (dimToClear >= 0) {
+        if (slices[dimToClear] != null && splitDim != dimToClear) {
+          break;
+        }
+        dimToClear--;
+      }
+      byte[] splitValue = (dimToClear == -1) ? markRightTree(rightCount, splitDim, source, null) : markRightTree(rightCount, splitDim, source, ordBitSet);

       int address = nodeID * (1+bytesPerDim);
       splitPackedValues[address] = (byte) splitDim;
       System.arraycopy(splitValue, 0, splitPackedValues, address + 1, bytesPerDim);

       // Partition all PathSlice that are not the split dim into sorted left and right sets, so we can recurse:

-      PathSlice[] leftSlices = new PathSlice[numDataDims];
-      PathSlice[] rightSlices = new PathSlice[numDataDims];
+      PathSlice[] leftSlices = new PathSlice[numIndexDims];
+      PathSlice[] rightSlices = new PathSlice[numIndexDims];

       byte[] minSplitPackedValue = new byte[packedIndexBytesLength];
       System.arraycopy(minPackedValue, 0, minSplitPackedValue, 0, packedIndexBytesLength);

       byte[] maxSplitPackedValue = new byte[packedIndexBytesLength];
       System.arraycopy(maxPackedValue, 0, maxSplitPackedValue, 0, packedIndexBytesLength);

-      // When we are on this dim, below, we clear the ordBitSet:
-      int dimToClear;
-      if (numDataDims - 1 == splitDim) {
-        dimToClear = numDataDims - 2;
-      } else {
-        dimToClear = numDataDims - 1;
-      }
-
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
+        if (slices[dim] == null) {
+          continue;
+        }
         if (dim == splitDim) {
           // No need to partition on this dim since it's a simple slice of the incoming already sorted slice, and we
           // will re-use its shared reader when visiting it as we recurse:
           leftSlices[dim] = new PathSlice(source.writer, source.start, leftCount);
           rightSlices[dim] = new PathSlice(source.writer, source.start + leftCount, rightCount);
           System.arraycopy(splitValue, 0, minSplitPackedValue, dim*bytesPerDim, bytesPerDim);
           System.arraycopy(splitValue, 0, maxSplitPackedValue, dim*bytesPerDim, bytesPerDim);
           continue;
         }
skipping to change at line 1863 (7.6.0) / line 1915 (7.7.0):

           throw verifyChecksum(t, slices[dim].writer);
         }
       }

       parentSplits[splitDim]++;
       // Recurse on left tree:
       build(2*nodeID, leafNodeOffset, leftSlices,
             ordBitSet, out,
             minPackedValue, maxSplitPackedValue, parentSplits,
             splitPackedValues, leafBlockFPs, toCloseHeroically);
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
-        if (dim != splitDim) {
+        if (dim != splitDim && slices[dim] != null) {
           leftSlices[dim].writer.destroy();
         }
       }

       // TODO: we could "tail recurse" here?  have our parent discard its refs as we recurse right?
       // Recurse on right tree:
       build(2*nodeID+1, leafNodeOffset, rightSlices,
             ordBitSet, out,
             minSplitPackedValue, maxPackedValue, parentSplits,
             splitPackedValues, leafBlockFPs, toCloseHeroically);
-      for(int dim=0;dim<numDataDims;dim++) {
+      for(int dim=0;dim<numIndexDims;dim++) {
         // Don't destroy the dim we split on because we just re-used what our caller above gave us for that dim:
-        if (dim != splitDim) {
+        if (dim != splitDim && slices[dim] != null) {
           rightSlices[dim].writer.destroy();
         }
       }
       parentSplits[splitDim]--;
+
+      if (deleteSplitDim) {
+        slices[splitDim].writer.destroy();
+      }
     }
   }

+  private void createPathSlice(PathSlice[] slices, int dim) throws IOException{
+    assert slices[dim] == null;
+    PathSlice current = null;
+    for(PathSlice slice : slices) {
+      if (slice != null) {
+        current = slice;
+        break;
+      }
+    }
+    if (current == null) {
+      slices[dim] = new PathSlice(sort(dim), 0, pointCount);
+    } else {
+      slices[dim] = new PathSlice(sort(dim, current.writer, current.start, current.count), 0, current.count);
+    }
+  }

   // only called from assert
   private boolean valuesInOrderAndBounds(int count, int sortedDim, byte[] minPackedValue, byte[] maxPackedValue,
                                          IntFunction<BytesRef> values, int[] docs, int docsOffset) throws IOException {
     byte[] lastPackedValue = new byte[packedBytesLength];
     int lastDoc = -1;
     for (int i=0;i<count;i++) {
       BytesRef packedValue = values.apply(i);
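The new createPathSlice is the core of this release's change: instead of eagerly sorting the whole point set once per dimension up front in finish(), a sorted slice for a dimension is materialized only when build() first needs it, seeded from whichever slice already exists so only the relevant sub-range is copied and re-sorted. A loose, self-contained sketch of that lazy pattern (illustrative names and types, not the Lucene classes):

    import java.util.Arrays;
    import java.util.Comparator;

    public class LazySortedSlices {
      private final int[][][] slices; // slices[dim] = points sorted by that dim, or null
      private final int[][] points;   // each point: one int coordinate per dimension

      LazySortedSlices(int numDims, int[][] points) {
        this.slices = new int[numDims][][];
        this.points = points;
      }

      int[][] get(int dim) {
        if (slices[dim] == null) {
          // Seed from an already-materialized slice if there is one
          // (any existing order works as input), else from the raw points:
          int[][] source = points;
          for (int[][] s : slices) {
            if (s != null) { source = s; break; }
          }
          int[][] copy = Arrays.copyOf(source, source.length);
          Arrays.sort(copy, Comparator.comparingInt((int[] p) -> p[dim]));
          slices[dim] = copy;
        }
        return slices[dim];
      }

      public static void main(String[] args) {
        int[][] pts = {{3, 1}, {1, 2}, {2, 0}};
        LazySortedSlices lazy = new LazySortedSlices(2, pts);
        // sorted by dim 1: [[2, 0], [3, 1], [1, 2]]
        System.out.println(Arrays.deepToString(lazy.get(1)));
      }
    }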
End of changes: 46 change blocks; 159 lines changed or deleted, 246 lines changed or added.
