RawText: improve performance of isCrLfText and isBinary

Inline the function isBinary(byte, byte), and reduce several duplicated checks in it, for better performance. Change-Id: Ida855ed4fd7456d8fb7ed68f3af2dbfa0e25897c

RawText: improve performance of isCrLfText and isBinary
6cf5d194 · XenoAmess · Thomas Wolf · bf48fb73 · 6cf5d194 · 6cf5d194
Commit 6cf5d194 authored 8 months ago by XenoAmess Committed by Thomas Wolf 6 months ago
--- a/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java
+++ b/org.eclipse.jgit.benchmarks/src/org/eclipse/jgit/benchmarks/RawTextBenchmark.java
+/*
+ * Copyright (C) 2022, Matthias Sohn <matthias.sohn@sap.com> and others
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Distribution License v. 1.0 which is available at
+ * https://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.eclipse.jgit.benchmarks;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+import java.util.concurrent.TimeUnit;
+
+import static org.eclipse.jgit.diff.RawText.getBufferSize;
+import static org.eclipse.jgit.diff.RawText.isBinary;
+import static org.eclipse.jgit.diff.RawText.isCrLfText;
+
+@State(Scope.Thread)
+public class RawTextBenchmark {
+
+	@State(Scope.Benchmark)
+	public static class BenchmarkState {
+
+		@Param({"1", "2", "3", "4", "5", "6"})
+		int testIndex;
+
+		@Param({"false", "true"})
+		boolean complete;
+
+		byte[] bytes;
+
+		@Setup
+		public void setupBenchmark() {
+			switch (testIndex) {
+				case 1: {
+					byte[] tmpBytes = "a".repeat(102400).getBytes();
+					bytes = tmpBytes;
+					break;
+				}
+				case 2: {
+					byte[] tmpBytes = "a".repeat(102400).getBytes();
+					byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
+					System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
+					tmpBytes2[500] = '\0';
+					tmpBytes2[tmpBytes.length] = '\0';
+					bytes = tmpBytes2;
+					break;
+				}
+				case 3: {
+					byte[] tmpBytes = "a".repeat(102400).getBytes();
+					byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
+					System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
+					tmpBytes2[500] = '\r';
+					tmpBytes2[tmpBytes.length] = '\r';
+					bytes = tmpBytes2;
+					break;
+				}
+				case 4: {
+					byte[] tmpBytes = "a".repeat(102400).getBytes();
+					byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
+					System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
+					tmpBytes2[499] = '\r';
+					tmpBytes2[500] = '\n';
+					tmpBytes2[tmpBytes.length - 1] = '\r';
+					tmpBytes2[tmpBytes.length] = '\n';
+					bytes = tmpBytes2;
+					break;
+				}
+				case 5: {
+					byte[] tmpBytes = "a".repeat(102400).getBytes();
+					tmpBytes[0] = '\0';
+					bytes = tmpBytes;
+					break;
+				}
+				case 6: {
+					byte[] tmpBytes = "a".repeat(102400).getBytes();
+					tmpBytes[0] = '\r';
+					bytes = tmpBytes;
+					break;
+				}
+				default:
+			}
+		}
+
+		@TearDown
+		public void teardown() {
+		}
+	}
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsCrLfTextOld(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isCrLfTextOld(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsCrLfTextNewCandidate1(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isCrLfTextNewCandidate1(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsCrLfTextNewCandidate2(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isCrLfTextNewCandidate2(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsCrLfTextNewCandidate3(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isCrLfTextNewCandidate3(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsCrLfTextNew(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isCrLfText(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsBinaryOld(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isBinaryOld(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+
+	@Benchmark
+	@BenchmarkMode({Mode.AverageTime})
+	@OutputTimeUnit(TimeUnit.NANOSECONDS)
+	@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
+	@Fork(1)
+	public void testIsBinaryNew(Blackhole blackhole, BenchmarkState state) {
+		blackhole.consume(
+				isBinary(
+						state.bytes,
+						state.bytes.length,
+						state.complete
+				)
+		);
+	}
+
+
+	/**
+	 * Determine heuristically whether a byte array represents binary (as
+	 * opposed to text) content.
+	 *
+	 * @param raw
+	 *			the raw file content.
+	 * @param length
+	 *			number of bytes in {@code raw} to evaluate. This should be
+	 *			{@code raw.length} unless {@code raw} was over-allocated by
+	 *			the caller.
+	 * @param complete
+	 *			whether {@code raw} contains the whole data
+	 * @return true if raw is likely to be a binary file, false otherwise
+	 * @since 6.0
+	 */
+	public static boolean isBinaryOld(byte[] raw, int length, boolean complete) {
+		// Similar heuristic as C Git. Differences:
+		// - limited buffer size; may be only the beginning of a large blob
+		// - no counting of printable vs. non-printable bytes < 0x20 and 0x7F
+		int maxLength = getBufferSize();
+		boolean isComplete = complete;
+		if (length > maxLength) {
+			// We restrict the length in all cases to getBufferSize() to get
+			// predictable behavior. Sometimes we load streams, and sometimes we
+			// have the full data in memory. With streams, we never look at more
+			// than the first getBufferSize() bytes. If we looked at more when
+			// we have the full data, different code paths in JGit might come to
+			// different conclusions.
+			length = maxLength;
+			isComplete = false;
+		}
+		byte last = 'x'; // Just something inconspicuous.
+		for (int ptr = 0; ptr < length; ptr++) {
+			byte curr = raw[ptr];
+			if (isBinary(curr, last)) {
+				return true;
+			}
+			last = curr;
+		}
+		if (isComplete) {
+			// Buffer contains everything...
+			return last == '\r'; // ... so this must be a lone CR
+		}
+		return false;
+	}
+
+	/**
+	 * Determine heuristically whether a byte array represents text content
+	 * using CR-LF as line separator.
+	 *
+	 * @param raw	  the raw file content.
+	 * @param length   number of bytes in {@code raw} to evaluate.
+	 * @param complete whether {@code raw} contains the whole data
+	 * @return {@code true} if raw is likely to be CR-LF delimited text,
+	 * {@code false} otherwise
+	 * @since 6.0
+	 */
+	public static boolean isCrLfTextOld(byte[] raw, int length, boolean complete) {
+		boolean has_crlf = false;
+		byte last = 'x'; // Just something inconspicuous
+		for (int ptr = 0; ptr < length; ptr++) {
+			byte curr = raw[ptr];
+			if (isBinary(curr, last)) {
+				return false;
+			}
+			if (curr == '\n' && last == '\r') {
+				has_crlf = true;
+			}
+			last = curr;
+		}
+		if (last == '\r') {
+			if (complete) {
+				// Lone CR: it's binary after all.
+				return false;
+			}
+			// Tough call. If the next byte, which we don't have, would be a
+			// '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
+			// based on what we already scanned; it wasn't binary until now.
+		}
+		return has_crlf;
+	}
+
+	/**
+	 * Determine heuristically whether a byte array represents text content
+	 * using CR-LF as line separator.
+	 *
+	 * @param raw
+	 *			the raw file content.
+	 * @param length
+	 *			number of bytes in {@code raw} to evaluate.
+	 * @return {@code true} if raw is likely to be CR-LF delimited text,
+	 *		 {@code false} otherwise
+	 * @param complete
+	 *			whether {@code raw} contains the whole data
+	 * @since 6.0
+	 */
+	public static boolean isCrLfTextNewCandidate1(byte[] raw, int length, boolean complete) {
+		boolean has_crlf = false;
+
+		// first detect empty
+		if (length <= 0) {
+			return false;
+		}
+
+		// next detect '\0'
+		for (int reversePtr = length - 1; reversePtr >= 0; --reversePtr) {
+			if (raw[reversePtr] == '\0') {
+				return false;
+			}
+		}
+
+		// if '\r' be last, then if complete then return non-crlf
+		if (raw[length - 1] == '\r' && complete) {
+			return false;
+		}
+
+		for (int ptr = 0; ptr < length - 1; ptr++) {
+			byte curr = raw[ptr];
+			if (curr == '\r') {
+				byte next = raw[ptr + 1];
+				if (next != '\n') {
+					return false;
+				}
+				// else
+				// we have crlf here
+				has_crlf = true;
+				// as next is '\n', it can never be '\r', just skip it from next check
+				++ptr;
+			}
+		}
+
+		return has_crlf;
+	}
+
+	/**
+	 * Determine heuristically whether a byte array represents text content
+	 * using CR-LF as line separator.
+	 *
+	 * @param raw
+	 *			the raw file content.
+	 * @param length
+	 *			number of bytes in {@code raw} to evaluate.
+	 * @return {@code true} if raw is likely to be CR-LF delimited text,
+	 *		 {@code false} otherwise
+	 * @param complete
+	 *			whether {@code raw} contains the whole data
+	 * @since 6.0
+	 */
+	public static boolean isCrLfTextNewCandidate2(byte[] raw, int length, boolean complete) {
+		boolean has_crlf = false;
+
+		// first detect empty
+		if (length <= 0) {
+			return false;
+		}
+
+		// if '\r' be last, then if complete then return non-crlf
+		byte last = raw[length - 1];
+		if (last == '\0' || last == '\r' && complete) {
+			return false;
+		}
+
+		for (int ptr = 0; ptr < length - 1; ptr++) {
+			byte b = raw[ptr];
+			switch (b) {
+				case  '\0':
+					return false;
+				case '\r': {
+					++ptr;
+					b = raw[ptr];
+					if (b != '\n') {
+						return false;
+					}
+					// else
+					// we have crlf here
+					has_crlf = true;
+					// as next is '\n', it can never be '\r', just skip it from next check
+					break;
+				}
+				default:
+					// do nothing;
+					break;
+			}
+		}
+
+		return has_crlf;
+	}
+
+	/**
+	 * Determine heuristically whether a byte array represents text content
+	 * using CR-LF as line separator.
+	 *
+	 * @param raw
+	 *			the raw file content.
+	 * @param length
+	 *			number of bytes in {@code raw} to evaluate.
+	 * @return {@code true} if raw is likely to be CR-LF delimited text,
+	 *		 {@code false} otherwise
+	 * @param complete
+	 *			whether {@code raw} contains the whole data
+	 * @since 6.0
+	 */
+	public static boolean isCrLfTextNewCandidate3(byte[] raw, int length, boolean complete) {
+		boolean has_crlf = false;
+
+		int ptr = -1;
+		byte current;
+		while (ptr < length - 2) {
+			current = raw[++ptr];
+			if ('\0' == current || '\r' == current && (raw[++ptr] != '\n' || !(has_crlf = true))) {
+				return false;
+			}
+		}
+
+		if (ptr == length - 2) {
+			// if '\r' be last, then if isComplete then return binary
+			current = raw[++ptr];
+			if('\0' == current || '\r' == current && complete){
+				return false;
+			}
+		}
+
+		return has_crlf;
+	}
+
+
+	public static void main(String[] args) throws RunnerException {
+		Options opt = new OptionsBuilder()
+				.include(RawTextBenchmark.class.getSimpleName())
+				.forks(1).jvmArgs("-ea").build();
+		new Runner(opt).run();
+	}
+}
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java
@@ -360,18 +360,22 @@ public static boolean isBinary(byte[] raw, int length, boolean complete) {
 			length = maxLength;
 			isComplete = false;
 		}
-		byte last = 'x'; // Just something inconspicuous.
-		for (int ptr = 0; ptr < length; ptr++) {
-			byte curr = raw[ptr];
-			if (isBinary(curr, last)) {
+
+		int ptr = -1;
+		byte current;
+		while (ptr < length - 2) {
+			current = raw[++ptr];
+			if ('\0' == current || '\r' == current && raw[++ptr] != '\n') {
 				return true;
 			}
-			last = curr;
 		}
-		if (isComplete) {
-			// Buffer contains everything...
-			return last == '\r'; // ... so this must be a lone CR
+
+		if (ptr == length - 2) {
+			// if '\r' be last, then if isComplete then return binary
+			current = raw[++ptr];
+			return '\0' == current || '\r' == current && isComplete;
 		}
+
 		return false;
 	}

@@ -467,26 +471,30 @@ public static boolean isCrLfText(byte[] raw, int length) {
 	 */
 	public static boolean isCrLfText(byte[] raw, int length, boolean complete) {
 		boolean has_crlf = false;
-		byte last = 'x'; // Just something inconspicuous
-		for (int ptr = 0; ptr < length; ptr++) {
-			byte curr = raw[ptr];
-			if (isBinary(curr, last)) {
+
+		int ptr = -1;
+		byte current;
+		while (ptr < length - 2) {
+			current = raw[++ptr];
+			if (current == '\0') {
+				return false;
+			}
+			if (current == '\r') {
+				if (raw[++ptr] != '\n') {
 					return false;
 				}
-			if (curr == '\n' && last == '\r') {
 				has_crlf = true;
 			}
-			last = curr;
 		}
-		if (last == '\r') {
-			if (complete) {
-				// Lone CR: it's binary after all.
+
+		if (ptr == length - 2) {
+			// if '\r' be last, then if isComplete then return binary
+			current = raw[++ptr];
+			if('\0' == current || '\r' == current && complete){
 				return false;
 			}
-			// Tough call. If the next byte, which we don't have, would be a
-			// '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
-			// based on what we already scanned; it wasn't binary until now.
 		}
+
 		return has_crlf;
 	}

@@ -578,4 +586,5 @@ public static RawText load(ObjectLoader ldr, int threshold)
 			return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
 		}
 	}
+
 }