This snippet is about sorting a set of strings using merge sort. However, this version is tailored for strings as it relies on lcp
values:
Given two strings \$A\$ and \$B\$, \$lcp(A, B)\$ is the length of the longest prefix shared by both \$A\$ and \$B\$. The cool point to note here is that if you know at least a lower bound of \$lcp(A, B)\$ (say, \$k < lcp(A, B)\$), whenever comparing \$A\$ and \$B\$, you can skip the first \$k\$ characters, and this string merge sort relies on these values in order to speed up the computation.
StringMergesort.java:
package net.coderodde.util;
import java.util.Arrays;
import java.util.Random;
public class StringMergesort {
private static final int ALPHABET_SIZE = 26;
private static final class ComparisonResult {
int cmpValue;
int lcp;
}
public static void sort(String[] array) {
sort(array, 0, array.length);
}
public static void sort(String[] array, int fromIndex, int toIndex) {
if (toIndex - fromIndex < 2) {
return;
}
String[] auxArray = array.clone();
int[] sourceLCPArray = new int[auxArray.length];
int[] targetLCPArray = new int[auxArray.length];
ComparisonResult result = new ComparisonResult();
sortImpl(auxArray,
array,
sourceLCPArray,
targetLCPArray,
fromIndex,
toIndex,
result);
}
private static void lcpCompare(String a,
String b,
int k,
ComparisonResult comparisonResult) {
int length = Math.min(a.length(), b.length());
for (int i = k; i < length; ++i) {
char ach = a.charAt(i);
char bch = b.charAt(i);
if (ach != bch) {
comparisonResult.cmpValue = Character.compare(ach, bch);
comparisonResult.lcp = i;
return;
}
}
comparisonResult.lcp = length;
if (a.length() > length) {
comparisonResult.cmpValue = 1;
} else if (a.length() < length) {
comparisonResult.cmpValue = -1;
} else {
comparisonResult.cmpValue = 0;
}
}
private static void sortImpl(String[] source,
String[] target,
int[] sourceLCPArray,
int[] targetLCPArray,
int fromIndex,
int toIndex,
ComparisonResult result) {
int rangeLength = toIndex - fromIndex;
if (rangeLength < 2) {
return;
}
int middle = fromIndex + ((toIndex - fromIndex) >> 1);
sortImpl(target,
source,
targetLCPArray,
sourceLCPArray,
fromIndex,
middle,
result);
sortImpl(target,
source,
targetLCPArray,
sourceLCPArray,
middle,
toIndex,
result);
merge(source,
target,
sourceLCPArray,
targetLCPArray,
fromIndex,
middle - fromIndex,
toIndex - middle,
result);
}
private static void merge(String[] source,
String[] target,
int[] sourceLCPArray,
int[] targetLCPArray,
int offset,
int leftRangeLength,
int rightRangeLength,
ComparisonResult result) {
int left = offset;
int leftBound = offset + leftRangeLength;
int right = leftBound;
int rightBound = right + rightRangeLength;
int targetIndex = offset;
while (left < leftBound && right < rightBound) {
int leftLCP = sourceLCPArray[left];
int rightLCP = sourceLCPArray[right];
if (leftLCP > rightLCP) {
target[targetIndex] = source[left];
targetLCPArray[targetIndex++] = sourceLCPArray[left++];
} else if (rightLCP > leftLCP) {
target[targetIndex] = source[right];
targetLCPArray[targetIndex++] = sourceLCPArray[right++];
} else {
lcpCompare(source[left], source[right], leftLCP, result);
if (result.cmpValue <= 0) {
target[targetIndex] = source[left];
targetLCPArray[targetIndex++] = sourceLCPArray[left++];
sourceLCPArray[right] = result.lcp;
} else {
target[targetIndex] = source[right];
targetLCPArray[targetIndex++] = sourceLCPArray[right++];
sourceLCPArray[left] = result.lcp;
}
}
}
while (left < leftBound) {
target[targetIndex] = source[left];
targetLCPArray[targetIndex] = sourceLCPArray[left];
targetIndex++;
left++;
}
while (right < rightBound) {
target[targetIndex] = source[right];
targetLCPArray[targetIndex] = sourceLCPArray[right];
targetIndex++;
right++;
}
}
public static void main(String[] args) {
long seed = System.nanoTime();
Random random = new Random(seed);
String[] array1 = createRandomStringArray(500_000, 300, random);
String[] array2 = array1.clone();
System.out.println("Seed = " + seed);
long startTime = System.nanoTime();
Arrays.sort(array1);
long endTime = System.nanoTime();
System.out.format("Arrays.sort in %.2f milliseconds.\n",
(endTime - startTime) / 1e6);
startTime = System.nanoTime();
StringMergesort.sort(array2);
endTime = System.nanoTime();
System.out.format("StringMergesort.sort in %.2f milliseconds.\n",
(endTime - startTime) / 1e6);
System.out.println("Arrays equal: " + Arrays.equals(array1, array2));
}
private static String[] createRandomStringArray(int size,
int maxLength,
Random random) {
String[] ret = new String[size];
for (int i = 0; i < size; ++i) {
ret[i] = randomString(maxLength, random);
}
return ret;
}
private static String randomString(int maxLength, Random random) {
int length = random.nextInt(maxLength + 1);
StringBuilder sb = new StringBuilder(length);
for (int i = 0; i < length; ++i) {
sb.append((char)('a' + random.nextInt(ALPHABET_SIZE)));
}
return sb.toString();
}
}
My performance figures are:
Seed = 5896859395728 Arrays.sort in 1220.41 milliseconds. StringMergesort.sort in 669.97 milliseconds. Arrays equal: true
Is there anything to improve here? Once again, I would like to hear comments on naming conventions, coding style, performance, and optimisation opportunities.