Files
terminal/src/buffer/out/UTextAdapter.cpp
Carlos Zamora 3918332853 Fix detecting partially visible URLs (#19878)
## Summary of the Pull Request
Fixes a bug where a partially visible URL would not be detected. This is
fixed by expanding the search space by 1 viewport height in both
directions.

The `_patternIntervalTree` now operates in the absolute-buffer space as
opposed to the viewport-relative space. It's a bit of an annoying
change, but the alternative would be to keep track of the offset used by
the method above, which I find more annoying, personally. As a part of
this change, I made it a bit more clear when something is
viewport-relative vs buffer-absolute.

Regarding mark mode hyperlink navigation, now that everything is in the
absolute-buffer space, I'm able to fix some of the issues in #13854. I
removed `_selectionIsTargetingUrl` and fixed/validated navigating to
hyperlinks that are partially visible.

## Validation Steps Performed
Detects URL that is...
 fully visible
 partially cropped off the top
 partially cropped off the bottom

 Above scenarios work with mark mode hyperlink navigation

Tests added

Closes #18177
Closes #13854
2026-02-27 11:13:07 -08:00

441 lines
16 KiB
C++

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
#include "precomp.h"
#include "UTextAdapter.h"
#include "textBuffer.hpp"
// All of these are somewhat annoying when trying to implement RefcountBuffer.
// You can't stuff a unique_ptr into ut->q (= void*) after all.
#pragma warning(disable : 26402) // Return a scoped object instead of a heap-allocated if it has a move constructor (r.3).
#pragma warning(disable : 26403) // Reset or explicitly delete an owner<T> pointer '...' (r.3).
#pragma warning(disable : 26409) // Avoid calling new and delete explicitly, use std::make_unique<T> instead (r.11).
struct RowRange
{
til::CoordType begin;
til::CoordType end;
};
struct RefcountBuffer
{
size_t references;
size_t capacity;
wchar_t data[1];
static RefcountBuffer* EnsureCapacityForOverwrite(RefcountBuffer* buffer, size_t capacity)
{
// We must not just ensure that `buffer` has at least `capacity`, but also that its reference count is <= 1, because otherwise we would resize a shared buffer.
if (buffer != nullptr && buffer->references <= 1 && buffer->capacity >= capacity)
{
return buffer;
}
const auto oldCapacity = buffer ? buffer->capacity << 1 : 0;
const auto newCapacity = std::max(capacity + 128, oldCapacity);
const auto newBuffer = static_cast<RefcountBuffer*>(::operator new(sizeof(RefcountBuffer) - sizeof(data) + newCapacity * sizeof(wchar_t)));
if (!newBuffer)
{
return nullptr;
}
if (buffer)
{
buffer->Release();
}
// Copying the old buffer's data is not necessary because utextAccess() will scribble right over it.
newBuffer->references = 1;
newBuffer->capacity = newCapacity;
return newBuffer;
}
void AddRef() noexcept
{
// With our usage patterns, either of these two would indicate
// an unbalanced AddRef/Release or a memory corruption.
assert(references > 0 && references < 1000);
references++;
}
void Release() noexcept
{
// With our usage patterns, either of these two would indicate
// an unbalanced AddRef/Release or a memory corruption.
assert(references > 0 && references < 1000);
if (--references == 0)
{
::operator delete(this);
}
}
};
constexpr size_t& accessLength(UText* ut) noexcept
{
static_assert(sizeof(ut->p) == sizeof(size_t));
return *std::bit_cast<size_t*>(&ut->p);
}
constexpr RefcountBuffer*& accessBuffer(UText* ut) noexcept
{
static_assert(sizeof(ut->q) == sizeof(RefcountBuffer*));
return *std::bit_cast<RefcountBuffer**>(&ut->q);
}
constexpr RowRange& accessRowRange(UText* ut) noexcept
{
static_assert(sizeof(ut->a) == sizeof(RowRange));
return *std::bit_cast<RowRange*>(&ut->a);
}
constexpr til::CoordType& accessCurrentRow(UText* ut) noexcept
{
return ut->b;
}
// An excerpt from the ICU documentation:
//
// Clone a UText. Much like opening a UText where the source text is itself another UText.
//
// A shallow clone replicates only the UText data structures; it does not make
// a copy of the underlying text. Shallow clones can be used as an efficient way to
// have multiple iterators active in a single text string that is not being modified.
//
// A shallow clone operation must not fail except for truly exceptional conditions such
// as memory allocation failures.
//
// @param dest A UText struct to be filled in with the result of the clone operation,
// or NULL if the clone function should heap-allocate a new UText struct.
// @param src The UText to be cloned.
// @param deep true to request a deep clone, false for a shallow clone.
// @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR should
// be returned if the text provider is unable to clone the original text.
// @return The newly created clone, or NULL if the clone operation failed.
static UText* U_CALLCONV utextClone(UText* dest, const UText* src, UBool deep, UErrorCode* status) noexcept
{
__assume(status != nullptr);
if (deep)
{
*status = U_UNSUPPORTED_ERROR;
return dest;
}
dest = utext_setup(dest, 0, status);
if (*status > U_ZERO_ERROR)
{
return dest;
}
memcpy(dest, src, sizeof(UText));
if (const auto buf = accessBuffer(dest))
{
buf->AddRef();
}
return dest;
}
// An excerpt from the ICU documentation:
//
// Gets the length of the text.
//
// @param ut the UText to get the length of.
// @return the length, in the native units of the original text string.
static int64_t U_CALLCONV utextNativeLength(UText* ut) noexcept
try
{
auto length = accessLength(ut);
if (!length)
{
const auto& textBuffer = *static_cast<const TextBuffer*>(ut->context);
const auto range = accessRowRange(ut);
for (til::CoordType y = range.begin; y < range.end; ++y)
{
const auto& row = textBuffer.GetRowByOffset(y);
// Later down below we'll add a newline to the text if !wasWrapForced, so we need to account for that here.
length += row.GetText().size() + !row.WasWrapForced();
}
accessLength(ut) = length;
}
return gsl::narrow_cast<int64_t>(length);
}
catch (...)
{
return 0;
}
// An excerpt from the ICU documentation:
//
// Get the description of the text chunk containing the text at a requested native index.
// The UText's iteration position will be left at the requested index.
// If the index is out of bounds, the iteration position will be left
// at the start or end of the string, as appropriate.
//
// @param ut the UText being accessed.
// @param nativeIndex Requested index of the text to be accessed.
// @param forward If true, then the returned chunk must contain text starting from the index, so that start<=index<limit.
// If false, then the returned chunk must contain text before the index, so that start<index<=limit.
// @return True if the requested index could be accessed. The chunk will contain the requested text.
// False value if a chunk cannot be accessed (the requested index is out of bounds).
static UBool U_CALLCONV utextAccess(UText* ut, int64_t nativeIndex, UBool forward) noexcept
try
{
auto neededIndex = nativeIndex;
// This will make it simpler for us to search the row that contains the nativeIndex,
// because we'll now only need to check for `start<=index<limit` and nothing else.
if (!forward)
{
neededIndex--;
}
const auto& textBuffer = *static_cast<const TextBuffer*>(ut->context);
const auto range = accessRowRange(ut);
const auto startOld = ut->chunkNativeStart;
const auto limitOld = ut->chunkNativeLimit;
auto start = startOld;
auto limit = limitOld;
if (neededIndex < startOld || neededIndex >= limitOld)
{
auto y = accessCurrentRow(ut);
std::wstring_view text;
bool wasWrapForced = false;
if (neededIndex < start)
{
do
{
--y;
if (y < range.begin)
{
break;
}
const auto& row = textBuffer.GetRowByOffset(y);
text = row.GetText();
wasWrapForced = row.WasWrapForced();
limit = start;
// Later down below we'll add a newline to the text if !wasWrapForced, so we need to account for that here.
start -= text.size() + !wasWrapForced;
} while (neededIndex < start);
}
else
{
do
{
++y;
if (y >= range.end)
{
break;
}
const auto& row = textBuffer.GetRowByOffset(y);
text = row.GetText();
wasWrapForced = row.WasWrapForced();
start = limit;
// Later down below we'll add a newline to the text if !wasWrapForced, so we need to account for that here.
limit += text.size() + !wasWrapForced;
} while (neededIndex >= limit);
}
assert(start >= 0);
// If we have already calculated the total length we can also assert that the limit is in range.
assert(ut->p == nullptr || static_cast<size_t>(limit) <= accessLength(ut));
// Even if we went out-of-bounds, we still need to update the chunkContents to contain the first/last chunk.
if (limit != limitOld)
{
if (!wasWrapForced)
{
const auto newSize = text.size() + 1;
const auto buffer = RefcountBuffer::EnsureCapacityForOverwrite(accessBuffer(ut), newSize);
memcpy(&buffer->data[0], text.data(), text.size() * sizeof(wchar_t));
til::at(buffer->data, text.size()) = L'\n';
text = { &buffer->data[0], newSize };
accessBuffer(ut) = buffer;
}
accessCurrentRow(ut) = y;
ut->chunkNativeStart = start;
ut->chunkNativeLimit = limit;
ut->chunkLength = gsl::narrow_cast<int32_t>(text.size());
#pragma warning(suppress : 26490) // Don't use reinterpret_cast (type.1).
ut->chunkContents = reinterpret_cast<const char16_t*>(text.data());
ut->nativeIndexingLimit = ut->chunkLength;
}
}
// The ICU documentation is a little bit misleading. It states:
// > @param forward [...] If true, start<=index<limit. If false, [...] start<index<=limit.
// but that's just for finding the target chunk. The chunkOffset is not actually constrained to that!
// std::clamp will perform a<=b<=c, which is what we want.
const auto clampedIndex = std::clamp(nativeIndex, start, limit);
auto offset = gsl::narrow_cast<int32_t>(clampedIndex - start);
// Don't leave the offset on a trailing surrogate pair. See U16_SET_CP_START.
// This assumes that the TextBuffer contains valid UTF-16 which may theoretically not be the case.
if (offset > 0 && offset < ut->chunkLength && U16_IS_TRAIL(til::at(ut->chunkContents, offset)))
{
offset--;
}
ut->chunkOffset = offset;
return neededIndex >= start && neededIndex < limit;
}
catch (...)
{
return false;
}
// An excerpt from the ICU documentation:
//
// Extract text from a UText into a UChar buffer.
// The size (number of 16 bit UChars) in the data to be extracted is returned.
// The full amount is returned, even when the specified buffer size is smaller.
// The extracted string must be NUL-terminated if there is sufficient space in the destination buffer.
//
// @param ut the UText from which to extract data.
// @param nativeStart the native index of the first character to extract.
// @param nativeLimit the native string index of the position following the last character to extract.
// @param dest the UChar (UTF-16) buffer into which the extracted text is placed
// @param destCapacity The size, in UChars, of the destination buffer. May be zero for precomputing the required size.
// @param status receives any error status. If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for preflighting.
// @return Number of UChars in the data. Does not include a trailing NUL.
//
// NOTE: utextExtract's correctness hasn't been verified yet. The code remains, just incase its functionality is needed in the future.
#pragma warning(suppress : 4505) // 'utextExtract': unreferenced function with internal linkage has been removed
static int32_t U_CALLCONV utextExtract(UText* ut, int64_t nativeStart, int64_t nativeLimit, char16_t* dest, int32_t destCapacity, UErrorCode* status) noexcept
try
{
__assume(status != nullptr);
if (*status > U_ZERO_ERROR)
{
return 0;
}
if (destCapacity < 0 || (dest == nullptr && destCapacity > 0) || nativeStart > nativeLimit)
{
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if (!utextAccess(ut, nativeStart, true))
{
return 0;
}
nativeLimit = std::min(ut->chunkNativeLimit, nativeLimit);
if (destCapacity <= 0)
{
return gsl::narrow_cast<int32_t>(nativeLimit - nativeStart);
}
const auto& textBuffer = *static_cast<const TextBuffer*>(ut->context);
const auto y = accessCurrentRow(ut);
const auto offset = ut->chunkNativeStart - nativeStart;
const auto text = textBuffer.GetRowByOffset(y).GetText().substr(gsl::narrow_cast<size_t>(std::max<int64_t>(0, offset)));
const auto destCapacitySizeT = gsl::narrow_cast<size_t>(destCapacity);
const auto length = std::min(destCapacitySizeT, text.size());
memcpy(dest, text.data(), length * sizeof(char16_t));
if (length < destCapacitySizeT)
{
#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
dest[length] = 0;
}
return gsl::narrow_cast<int32_t>(length);
}
catch (...)
{
// The only thing that can fail is GetRowByOffset() which in turn can only fail when VirtualAlloc() fails.
*status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
static void U_CALLCONV utextClose(UText* ut) noexcept
{
if (const auto buffer = accessBuffer(ut))
{
buffer->Release();
}
}
static constexpr UTextFuncs utextFuncs{
.tableSize = sizeof(UTextFuncs),
.clone = utextClone,
.nativeLength = utextNativeLength,
.access = utextAccess,
.close = utextClose,
};
// Creates a UText from the given TextBuffer that spans rows [rowBeg,RowEnd).
Microsoft::Console::ICU::unique_utext Microsoft::Console::ICU::UTextFromTextBuffer(const TextBuffer& textBuffer, til::CoordType rowBeg, til::CoordType rowEnd) noexcept
{
#pragma warning(suppress : 26477) // Use 'nullptr' rather than 0 or NULL (es.47).
unique_utext ut{ UTEXT_INITIALIZER };
UErrorCode status = U_ZERO_ERROR;
utext_setup(&ut, 0, &status);
FAIL_FAST_IF(status > U_ZERO_ERROR);
rowBeg = std::max<til::CoordType>(0, rowBeg);
rowEnd = std::min(textBuffer.GetSize().BottomExclusive(), rowEnd);
ut.providerProperties = (1 << UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE) | (1 << UTEXT_PROVIDER_STABLE_CHUNKS);
ut.pFuncs = &utextFuncs;
ut.context = &textBuffer;
accessCurrentRow(&ut) = rowBeg - 1; // the utextAccess() below will advance this by 1.
accessRowRange(&ut) = { rowBeg, rowEnd };
utextAccess(&ut, 0, true);
return ut;
}
// Returns a half-open [beg,end) range given a text start and end position.
// This function is designed to be used with uregex_start64/uregex_end64.
til::point_span Microsoft::Console::ICU::BufferRangeFromMatch(UText* ut, URegularExpression* re)
{
UErrorCode status = U_ZERO_ERROR;
const auto nativeIndexBeg = uregex_start64(re, 0, &status);
const auto nativeIndexEnd = uregex_end64(re, 0, &status);
const auto& textBuffer = *static_cast<const TextBuffer*>(ut->context);
til::point_span ret;
if (utextAccess(ut, nativeIndexBeg, true))
{
const auto y = accessCurrentRow(ut);
ret.start.x = textBuffer.GetRowByOffset(y).GetLeadingColumnAtCharOffset(ut->chunkOffset);
ret.start.y = y;
}
else
{
ret.start.y = accessRowRange(ut).begin;
}
if (utextAccess(ut, nativeIndexEnd, true))
{
const auto y = accessCurrentRow(ut);
ret.end.x = textBuffer.GetRowByOffset(y).GetLeadingColumnAtCharOffset(ut->chunkOffset);
ret.end.y = y;
}
else
{
ret.end = ret.start;
}
return ret;
}