Improve sixel performance (#19639)

Together with #19640 this bumps the performance on my HW by around 60%.
You can now stream 4K images at >60 FPS on a high-end CPU.

## Validation Steps Performed
* Used imagemagick to dump large images 
This commit is contained in:
Leonard Hecker
2026-01-12 14:05:33 +01:00
committed by GitHub
parent 64caebecd3
commit c0debf8e26
3 changed files with 80 additions and 18 deletions

View File

@@ -1017,6 +1017,7 @@ minkernel
MINMAXINFO
minwin
minwindef
misprediction
MMBB
mmcc
MMCPL

View File

@@ -729,6 +729,11 @@ void SixelParser::_decreaseFilledBackgroundHeight(const int decreasedHeight) noe
}
}
#pragma warning(push)
#pragma warning(disable : 26429) // Symbol 'imageBufferPtr' is never tested for nullness, it can be marked as not_null (f.23).
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
#pragma warning(disable : 26490) // Don't use reinterpret_cast (type.1).
void SixelParser::_writeToImageBuffer(int sixelValue, int repeatCount)
{
// On terminals that support the raster attributes command (which sets the
@@ -736,32 +741,88 @@ void SixelParser::_writeToImageBuffer(int sixelValue, int repeatCount)
// is received. So if we haven't filled it yet, we need to do so now.
_fillImageBackground();
repeatCount = std::min(repeatCount, _imageMaxWidth - _imageCursor.x);
if (repeatCount <= 0)
{
return;
}
if (sixelValue == 0)
{
_imageCursor.x += repeatCount;
return;
}
// This allows us to unsafely cast _imageBuffer to uint16_t
// and benefit from compiler/STL optimizations.
static_assert(sizeof(IndexedPixel) == sizeof(int16_t));
static_assert(alignof(IndexedPixel) == alignof(int16_t));
// Then we need to render the 6 vertical pixels that are represented by the
// bits in the sixel value. Although note that each of these sixel pixels
// may cover more than one device pixel, depending on the aspect ratio.
const auto targetOffset = _imageCursor.y * _imageMaxWidth + _imageCursor.x;
auto imageBufferPtr = std::next(_imageBuffer.data(), targetOffset);
repeatCount = std::min(repeatCount, _imageMaxWidth - _imageCursor.x);
for (auto i = 0; i < 6; i++)
const auto foreground = std::bit_cast<int16_t>(_foregroundPixel);
auto imageBufferPtr = reinterpret_cast<int16_t*>(_imageBuffer.data() + targetOffset);
// A aspect ratio of 1:1 is the most common and worth optimizing.
if (_pixelAspectRatio == 1)
{
if (sixelValue & 1)
do
{
auto repeatAspectRatio = _pixelAspectRatio;
do
// This gets unrolled by MSVC. It's written this way to use CMOV instructions.
// Modern CPUs have fat caches and deep pipelines. It's better to do pointless reads
// from memory than causing branch misprediction, as sixelValue is highly random.
for (int i = 0; i < 6; i++)
{
std::fill_n(imageBufferPtr, repeatCount, _foregroundPixel);
std::advance(imageBufferPtr, _imageMaxWidth);
} while (--repeatAspectRatio > 0);
}
else
{
std::advance(imageBufferPtr, _imageMaxWidth * _pixelAspectRatio);
}
sixelValue >>= 1;
const auto test = sixelValue & (1 << i);
// Possibly pointless read from memory, but...
const auto before = imageBufferPtr[i * _imageMaxWidth];
// ...it allows the compiler to turn this into a CMOV (= no branch misprediction).
const auto after = test ? foreground : before;
imageBufferPtr[i * _imageMaxWidth] = after;
}
_imageCursor.x += 1;
imageBufferPtr += 1;
repeatCount -= 1;
} while (repeatCount > 0);
}
else
{
for (auto i = 0; i < 6; i++)
{
if (sixelValue & 1)
{
auto repeatAspectRatio = _pixelAspectRatio;
do
{
// If this used std::fill_n or just a primitive loop, MSVC would compile
// it to a `rep stosw` instruction, which has a high startup cost.
// This is not ideal when our repeatCount is almost always small.
// The way this does ptr++ and len-- is also ideal for optimization.
auto ptr = imageBufferPtr;
auto remaining = repeatCount;
do
{
__iso_volatile_store16(ptr++, foreground);
} while (--remaining != 0);
imageBufferPtr += _imageMaxWidth;
} while (--repeatAspectRatio > 0);
}
else
{
std::advance(imageBufferPtr, _imageMaxWidth * _pixelAspectRatio);
}
sixelValue >>= 1;
}
_imageCursor.x += repeatCount;
}
_imageCursor.x += repeatCount;
}
#pragma warning(pop)
void SixelParser::_eraseImageBufferRows(const int rowCount, const til::CoordType rowOffset) noexcept
{
const auto pixelCount = rowCount * _cellSize.height;

View File

@@ -42,7 +42,7 @@ namespace Microsoft::Console::VirtualTerminal
// to retain the 16-bit size.
static constexpr size_t MAX_COLORS = 256;
using IndexType = uint8_t;
struct IndexedPixel
struct alignas(int16_t) IndexedPixel
{
uint8_t transparent = false;
IndexType colorIndex = 0;
@@ -104,8 +104,8 @@ namespace Microsoft::Console::VirtualTerminal
const size_t _maxColors;
size_t _colorsUsed = 0;
size_t _colorsAvailable = 0;
bool _colorTableChanged = false;
IndexedPixel _foregroundPixel = {};
bool _colorTableChanged = false;
void _initImageBuffer();
void _resizeImageBuffer(const til::CoordType requiredHeight);