Skip to content
Snippets Groups Projects
Commit 6cf5d194 authored by XenoAmess's avatar XenoAmess Committed by Thomas Wolf
Browse files

RawText: improve performance of isCrLfText and isBinary

Inline the function isBinary(byte, byte), and reduce several duplicated
checks in it, for better performance.

Change-Id: Ida855ed4fd7456d8fb7ed68f3af2dbfa0e25897c
parent bf48fb73
No related branches found
No related tags found
No related merge requests found
/*
* Copyright (C) 2022, Matthias Sohn <matthias.sohn@sap.com> and others
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Distribution License v. 1.0 which is available at
* https://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
package org.eclipse.jgit.benchmarks;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.TearDown;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import java.util.concurrent.TimeUnit;
import static org.eclipse.jgit.diff.RawText.getBufferSize;
import static org.eclipse.jgit.diff.RawText.isBinary;
import static org.eclipse.jgit.diff.RawText.isCrLfText;
@State(Scope.Thread)
public class RawTextBenchmark {
@State(Scope.Benchmark)
public static class BenchmarkState {
@Param({"1", "2", "3", "4", "5", "6"})
int testIndex;
@Param({"false", "true"})
boolean complete;
byte[] bytes;
@Setup
public void setupBenchmark() {
switch (testIndex) {
case 1: {
byte[] tmpBytes = "a".repeat(102400).getBytes();
bytes = tmpBytes;
break;
}
case 2: {
byte[] tmpBytes = "a".repeat(102400).getBytes();
byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
tmpBytes2[500] = '\0';
tmpBytes2[tmpBytes.length] = '\0';
bytes = tmpBytes2;
break;
}
case 3: {
byte[] tmpBytes = "a".repeat(102400).getBytes();
byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
tmpBytes2[500] = '\r';
tmpBytes2[tmpBytes.length] = '\r';
bytes = tmpBytes2;
break;
}
case 4: {
byte[] tmpBytes = "a".repeat(102400).getBytes();
byte[] tmpBytes2 = new byte[tmpBytes.length + 1];
System.arraycopy(tmpBytes, 0, tmpBytes2, 0, tmpBytes.length);
tmpBytes2[499] = '\r';
tmpBytes2[500] = '\n';
tmpBytes2[tmpBytes.length - 1] = '\r';
tmpBytes2[tmpBytes.length] = '\n';
bytes = tmpBytes2;
break;
}
case 5: {
byte[] tmpBytes = "a".repeat(102400).getBytes();
tmpBytes[0] = '\0';
bytes = tmpBytes;
break;
}
case 6: {
byte[] tmpBytes = "a".repeat(102400).getBytes();
tmpBytes[0] = '\r';
bytes = tmpBytes;
break;
}
default:
}
}
@TearDown
public void teardown() {
}
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsCrLfTextOld(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isCrLfTextOld(
state.bytes,
state.bytes.length,
state.complete
)
);
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsCrLfTextNewCandidate1(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isCrLfTextNewCandidate1(
state.bytes,
state.bytes.length,
state.complete
)
);
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsCrLfTextNewCandidate2(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isCrLfTextNewCandidate2(
state.bytes,
state.bytes.length,
state.complete
)
);
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsCrLfTextNewCandidate3(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isCrLfTextNewCandidate3(
state.bytes,
state.bytes.length,
state.complete
)
);
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsCrLfTextNew(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isCrLfText(
state.bytes,
state.bytes.length,
state.complete
)
);
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsBinaryOld(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isBinaryOld(
state.bytes,
state.bytes.length,
state.complete
)
);
}
@Benchmark
@BenchmarkMode({Mode.AverageTime})
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(1)
public void testIsBinaryNew(Blackhole blackhole, BenchmarkState state) {
blackhole.consume(
isBinary(
state.bytes,
state.bytes.length,
state.complete
)
);
}
/**
* Determine heuristically whether a byte array represents binary (as
* opposed to text) content.
*
* @param raw
* the raw file content.
* @param length
* number of bytes in {@code raw} to evaluate. This should be
* {@code raw.length} unless {@code raw} was over-allocated by
* the caller.
* @param complete
* whether {@code raw} contains the whole data
* @return true if raw is likely to be a binary file, false otherwise
* @since 6.0
*/
public static boolean isBinaryOld(byte[] raw, int length, boolean complete) {
// Similar heuristic as C Git. Differences:
// - limited buffer size; may be only the beginning of a large blob
// - no counting of printable vs. non-printable bytes < 0x20 and 0x7F
int maxLength = getBufferSize();
boolean isComplete = complete;
if (length > maxLength) {
// We restrict the length in all cases to getBufferSize() to get
// predictable behavior. Sometimes we load streams, and sometimes we
// have the full data in memory. With streams, we never look at more
// than the first getBufferSize() bytes. If we looked at more when
// we have the full data, different code paths in JGit might come to
// different conclusions.
length = maxLength;
isComplete = false;
}
byte last = 'x'; // Just something inconspicuous.
for (int ptr = 0; ptr < length; ptr++) {
byte curr = raw[ptr];
if (isBinary(curr, last)) {
return true;
}
last = curr;
}
if (isComplete) {
// Buffer contains everything...
return last == '\r'; // ... so this must be a lone CR
}
return false;
}
/**
* Determine heuristically whether a byte array represents text content
* using CR-LF as line separator.
*
* @param raw the raw file content.
* @param length number of bytes in {@code raw} to evaluate.
* @param complete whether {@code raw} contains the whole data
* @return {@code true} if raw is likely to be CR-LF delimited text,
* {@code false} otherwise
* @since 6.0
*/
public static boolean isCrLfTextOld(byte[] raw, int length, boolean complete) {
boolean has_crlf = false;
byte last = 'x'; // Just something inconspicuous
for (int ptr = 0; ptr < length; ptr++) {
byte curr = raw[ptr];
if (isBinary(curr, last)) {
return false;
}
if (curr == '\n' && last == '\r') {
has_crlf = true;
}
last = curr;
}
if (last == '\r') {
if (complete) {
// Lone CR: it's binary after all.
return false;
}
// Tough call. If the next byte, which we don't have, would be a
// '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
// based on what we already scanned; it wasn't binary until now.
}
return has_crlf;
}
/**
* Determine heuristically whether a byte array represents text content
* using CR-LF as line separator.
*
* @param raw
* the raw file content.
* @param length
* number of bytes in {@code raw} to evaluate.
* @return {@code true} if raw is likely to be CR-LF delimited text,
* {@code false} otherwise
* @param complete
* whether {@code raw} contains the whole data
* @since 6.0
*/
public static boolean isCrLfTextNewCandidate1(byte[] raw, int length, boolean complete) {
boolean has_crlf = false;
// first detect empty
if (length <= 0) {
return false;
}
// next detect '\0'
for (int reversePtr = length - 1; reversePtr >= 0; --reversePtr) {
if (raw[reversePtr] == '\0') {
return false;
}
}
// if '\r' be last, then if complete then return non-crlf
if (raw[length - 1] == '\r' && complete) {
return false;
}
for (int ptr = 0; ptr < length - 1; ptr++) {
byte curr = raw[ptr];
if (curr == '\r') {
byte next = raw[ptr + 1];
if (next != '\n') {
return false;
}
// else
// we have crlf here
has_crlf = true;
// as next is '\n', it can never be '\r', just skip it from next check
++ptr;
}
}
return has_crlf;
}
/**
* Determine heuristically whether a byte array represents text content
* using CR-LF as line separator.
*
* @param raw
* the raw file content.
* @param length
* number of bytes in {@code raw} to evaluate.
* @return {@code true} if raw is likely to be CR-LF delimited text,
* {@code false} otherwise
* @param complete
* whether {@code raw} contains the whole data
* @since 6.0
*/
public static boolean isCrLfTextNewCandidate2(byte[] raw, int length, boolean complete) {
boolean has_crlf = false;
// first detect empty
if (length <= 0) {
return false;
}
// if '\r' be last, then if complete then return non-crlf
byte last = raw[length - 1];
if (last == '\0' || last == '\r' && complete) {
return false;
}
for (int ptr = 0; ptr < length - 1; ptr++) {
byte b = raw[ptr];
switch (b) {
case '\0':
return false;
case '\r': {
++ptr;
b = raw[ptr];
if (b != '\n') {
return false;
}
// else
// we have crlf here
has_crlf = true;
// as next is '\n', it can never be '\r', just skip it from next check
break;
}
default:
// do nothing;
break;
}
}
return has_crlf;
}
/**
* Determine heuristically whether a byte array represents text content
* using CR-LF as line separator.
*
* @param raw
* the raw file content.
* @param length
* number of bytes in {@code raw} to evaluate.
* @return {@code true} if raw is likely to be CR-LF delimited text,
* {@code false} otherwise
* @param complete
* whether {@code raw} contains the whole data
* @since 6.0
*/
public static boolean isCrLfTextNewCandidate3(byte[] raw, int length, boolean complete) {
boolean has_crlf = false;
int ptr = -1;
byte current;
while (ptr < length - 2) {
current = raw[++ptr];
if ('\0' == current || '\r' == current && (raw[++ptr] != '\n' || !(has_crlf = true))) {
return false;
}
}
if (ptr == length - 2) {
// if '\r' be last, then if isComplete then return binary
current = raw[++ptr];
if('\0' == current || '\r' == current && complete){
return false;
}
}
return has_crlf;
}
public static void main(String[] args) throws RunnerException {
Options opt = new OptionsBuilder()
.include(RawTextBenchmark.class.getSimpleName())
.forks(1).jvmArgs("-ea").build();
new Runner(opt).run();
}
}
......@@ -360,18 +360,22 @@ public static boolean isBinary(byte[] raw, int length, boolean complete) {
length = maxLength;
isComplete = false;
}
byte last = 'x'; // Just something inconspicuous.
for (int ptr = 0; ptr < length; ptr++) {
byte curr = raw[ptr];
if (isBinary(curr, last)) {
int ptr = -1;
byte current;
while (ptr < length - 2) {
current = raw[++ptr];
if ('\0' == current || '\r' == current && raw[++ptr] != '\n') {
return true;
}
last = curr;
}
if (isComplete) {
// Buffer contains everything...
return last == '\r'; // ... so this must be a lone CR
if (ptr == length - 2) {
// if '\r' be last, then if isComplete then return binary
current = raw[++ptr];
return '\0' == current || '\r' == current && isComplete;
}
return false;
}
......@@ -467,26 +471,30 @@ public static boolean isCrLfText(byte[] raw, int length) {
*/
public static boolean isCrLfText(byte[] raw, int length, boolean complete) {
boolean has_crlf = false;
byte last = 'x'; // Just something inconspicuous
for (int ptr = 0; ptr < length; ptr++) {
byte curr = raw[ptr];
if (isBinary(curr, last)) {
int ptr = -1;
byte current;
while (ptr < length - 2) {
current = raw[++ptr];
if (current == '\0') {
return false;
}
if (current == '\r') {
if (raw[++ptr] != '\n') {
return false;
}
if (curr == '\n' && last == '\r') {
has_crlf = true;
}
last = curr;
}
if (last == '\r') {
if (complete) {
// Lone CR: it's binary after all.
if (ptr == length - 2) {
// if '\r' be last, then if isComplete then return binary
current = raw[++ptr];
if('\0' == current || '\r' == current && complete){
return false;
}
// Tough call. If the next byte, which we don't have, would be a
// '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
// based on what we already scanned; it wasn't binary until now.
}
return has_crlf;
}
......@@ -578,4 +586,5 @@ public static RawText load(ObjectLoader ldr, int threshold)
return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
}
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment