// terraform/lang/langserver/source.go

package langserver

import (
	"bufio"
	"unicode/utf16"
	"unicode/utf8"

	"github.com/apparentlymart/go-textseg/textseg"
	"github.com/hashicorp/hcl/v2"

	lsp "github.com/hashicorp/terraform/internal/lsp"
)

type (
	// sourceLine is a single line of a source buffer together with the HCL
	// range that line occupies.
	sourceLine struct {
		// content is the raw bytes of the line as produced by the line
		// scanner in makeSourceLines.
		content []byte
		// rng is the HCL source range reported by the scanner for content.
		rng hcl.Range
	}

	// sourceLines is the ordered sequence of lines of one source buffer. The
	// element at index i is addressed as LSP line i and HCL line i+1 by the
	// conversion methods defined on this type.
	sourceLines []sourceLine
)

// makeSourceLines splits the buffer s into lines (using bufio.ScanLines driven
// by an HCL range scanner) and records the bytes and source range of each one.
// The given filename is used for the ranges.
//
// The result always contains at least one element: if the scanner produces no
// lines at all then a single empty line located at line 1, column 1 is
// returned instead.
func makeSourceLines(filename string, s []byte) sourceLines {
	scanner := hcl.NewRangeScanner(s, filename, bufio.ScanLines)

	var lines sourceLines
	for scanner.Scan() {
		lines = append(lines, sourceLine{content: scanner.Bytes(), rng: scanner.Range()})
	}
	if len(lines) > 0 {
		return lines
	}

	// Nothing was scanned, so synthesize one empty line so that callers can
	// always rely on there being a first line to clamp against.
	origin := hcl.Pos{Line: 1, Column: 1}
	return sourceLines{
		{
			content: nil,
			rng: hcl.Range{
				Filename: filename,
				Start:    origin,
				End:      origin,
			},
		},
	}
}

// rangeHCLToLSP converts a range in HCL's representation to the equivalent
// range in LSP's representation by converting its start and end positions
// independently with posHCLToLSP.
func (ls sourceLines) rangeHCLToLSP(in hcl.Range) lsp.Range {
	start := ls.posHCLToLSP(in.Start)
	end := ls.posHCLToLSP(in.End)
	return lsp.Range{Start: start, End: end}
}

// posLSPToHCL converts a position in LSP's representation into the equivalent
// position in HCL's representation.
//
// Callers should treat the result as undefined when the given position is not
// within the content. In practice an empty receiver yields line 1, column 1,
// a line index past the last line yields the end of the last line, and a
// negative line index yields the start of the first line.
func (ls sourceLines) posLSPToHCL(in lsp.Position) hcl.Pos {
	lineIdx := int(in.Line)
	switch {
	case len(ls) == 0:
		return hcl.Pos{Line: 1, Column: 1, Byte: 0}
	case lineIdx >= len(ls):
		return ls[len(ls)-1].rng.End
	case lineIdx < 0:
		return ls[0].rng.Start
	}

	return ls[lineIdx].posForLSPColumn(in.Character)
}

// posHCLToLSP converts a position in HCL's representation into the equivalent
// position in LSP's representation.
//
// Callers should treat the result as undefined when the given position is not
// within the content, or when its Byte value is inconsistent with its Line
// and Column values. In practice a line number past the last line is replaced
// by the end of the last line and a line number below 1 by the start of the
// first line before the column is converted.
func (ls sourceLines) posHCLToLSP(in hcl.Pos) lsp.Position {
	lineCount := len(ls)
	if lineCount == 0 {
		return lsp.Position{Line: 0, Character: 0}
	}

	switch {
	case (in.Line - 1) >= lineCount:
		in = ls[lineCount-1].rng.End
	case in.Line < 1:
		in = ls[0].rng.Start
	}

	// HCL lines are one-based while both our slice and LSP are zero-based.
	lineIdx := in.Line - 1
	return lsp.Position{
		Line:      float64(lineIdx),
		Character: ls[lineIdx].lspColumnForPos(in),
	}
}

// posLSPToByte converts a position in LSP's representation into a byte
// offset into the full source buffer. If the given position is not within
// the content then the result should be treated as undefined. This differs
// from the byte offset returned by posLSPToHCL because it can potentially
// point into the middle of a grapheme cluster. The result is inexact if the
// position refers to the second unit of a UTF-16 surrogate pair.
//
// This should NOT be used to process incoming text change requests from the
// LSP client because the user may type into the middle of a surrogate pair
// and the rounding behavior of this method would cause the first unit
// to be lost, causing us to get out of sync with the client.
func (ls sourceLines) posLSPToByte(in lsp.Position) int {
	lineIdx := int(in.Line)
	switch {
	case len(ls) == 0:
		return 0
	case lineIdx >= len(ls):
		// Past the last line: use the end of the last line.
		return ls[len(ls)-1].rng.End.Byte
	case lineIdx < 0:
		// Before the first line: use the start of the first line.
		return ls[0].rng.Start.Byte
	}

	return ls[lineIdx].byteForLSPColumn(in.Character)
}

// allASCII reports whether the receiver is provably all ASCII, which enables
// fast paths where columns and bytes can be treated as equivalent.
//
// The test compares the byte width of the line's range against its column
// width: when the two match, the line cannot contain any multi-byte grapheme
// cluster.
func (l sourceLine) allASCII() bool {
	start, end := l.rng.Start, l.rng.End
	return end.Byte-start.Byte == end.Column-start.Column
}

// lspLen returns the length of this line's content measured the way LSP
// measures it: as the number of UTF-16 code units needed to represent the
// content.
func (l sourceLine) lspLen() int {
	if l.allASCII() {
		// Every byte is its own character, so the byte length is the answer.
		return len(l.content)
	}

	units := 0
	for rest := l.content; len(rest) > 0; {
		r, size := utf8.DecodeRune(rest)
		rest = rest[size:]

		// Every rune takes at least one code unit...
		units++
		// ...and utf16.EncodeRune yields the (U+FFFD, U+FFFD) pair exactly
		// when the rune does not need a surrogate pair, so any other result
		// means a second code unit is required.
		if hi, lo := utf16.EncodeRune(r); hi != 0xfffd || lo != 0xfffd {
			units++
		}
	}
	return units
}

// posForLSPColumn takes an lsp.Position.Character value for the receiving line
// and finds the equivalent hcl.Pos for it.
//
// A negative column yields the start of the line, and a column beyond the end
// of the line yields the end of the line. Because HCL positions always sit on
// grapheme cluster boundaries, a column that falls inside a grapheme cluster
// resolves to the start of the following cluster.
func (l sourceLine) posForLSPColumn(lspCol float64) hcl.Pos {
	inCol := int(lspCol)
	if inCol < 0 {
		return l.rng.Start
	}

	// Easy path: if the entire line is ASCII then column counts are equivalent
	// in LSP vs. HCL aside from zero- vs. one-based counting.
	if l.allASCII() {
		// Clamp columns past the end of the line so that we never return a
		// position outside of the line, matching the non-ASCII path below.
		width := l.rng.End.Byte - l.rng.Start.Byte
		if inCol > width {
			return l.rng.End
		}
		return hcl.Pos{
			Line:   l.rng.Start.Line,
			Column: inCol + 1,
			Byte:   l.rng.Start.Byte + inCol,
		}
	}

	// If there are non-ASCII characters then we need to edge carefully
	// along the line while counting UTF-16 code units in our UTF-8 buffer,
	// since LSP columns are a count of UTF-16 units.
	byteCt := 0
	utf16Ct := 0
	colIdx := 1
	remain := l.content
	for len(remain) > 0 && utf16Ct < inCol {
		adv, cluster, err := textseg.ScanGraphemeClusters(remain, true)
		if err != nil || adv <= 0 {
			// Defensive: without forward progress this loop would never
			// terminate, so treat the column as invalid instead.
			return l.rng.End
		}
		remain = remain[adv:]
		byteCt += adv
		colIdx++

		// Count the UTF-16 code units needed for every code point in the
		// cluster we just stepped over.
		for len(cluster) > 0 {
			r, size := utf8.DecodeRune(cluster)
			cluster = cluster[size:]
			c1, c2 := utf16.EncodeRune(r)
			if c1 == 0xfffd && c2 == 0xfffd {
				utf16Ct++ // codepoint fits in one 16-bit unit
			} else {
				utf16Ct += 2 // codepoint requires a surrogate pair
			}
		}
	}

	if len(remain) == 0 {
		// Either we landed exactly on the end of the line or we ran out of
		// characters, in which case the given column is invalid; both cases
		// resolve to the end of the line.
		return l.rng.End
	}
	return hcl.Pos{
		Line:   l.rng.Start.Line,
		Column: colIdx,
		Byte:   l.rng.Start.Byte + byteCt,
	}
}

// lspColumnForPos takes a hcl.Pos that should be within the receiving line
// and returns its corresponding LSP column offset within the same line, which
// is a count of UTF-16 code units from the start of the line.
//
// A position before the start of the line yields the start-of-line column. A
// position after the end of the line is treated as the end of the line, so
// the result is then the full length of the line in UTF-16 code units.
func (l sourceLine) lspColumnForPos(pos hcl.Pos) float64 {
	if pos.Column < l.rng.Start.Column || pos.Byte < l.rng.Start.Byte {
		return float64(l.rng.Start.Column - 1)
	}
	if pos.Column > l.rng.End.Column || pos.Byte > l.rng.End.Byte {
		// Clamp to the end of the line and convert that position normally.
		// Returning End.Column-1 directly would be wrong for non-ASCII lines
		// because HCL columns count grapheme clusters, not UTF-16 units.
		pos = l.rng.End
	}

	// Easy path: if the entire line is ASCII then column counts are equivalent
	// in LSP vs. HCL aside from zero- vs. one-based counting.
	if l.allASCII() {
		return float64(pos.Column - 1)
	}

	// If there are non-ASCII characters then we need to edge carefully
	// along the line while counting UTF-16 code units in our UTF-8 buffer,
	// since LSP columns are a count of UTF-16 units.
	utf16Ct := 0
	colIdx := 1
	remain := l.content
	for colIdx < pos.Column && len(remain) > 0 {
		adv, cluster, err := textseg.ScanGraphemeClusters(remain, true)
		if err != nil || adv <= 0 {
			// Defensive: without forward progress this loop would never
			// terminate, so stop with what we have counted so far.
			break
		}
		remain = remain[adv:]
		colIdx++

		// Count the UTF-16 code units needed for every code point in the
		// cluster we just stepped over.
		for len(cluster) > 0 {
			r, size := utf8.DecodeRune(cluster)
			cluster = cluster[size:]
			c1, c2 := utf16.EncodeRune(r)
			if c1 == 0xfffd && c2 == 0xfffd {
				utf16Ct++ // codepoint fits in one 16-bit unit
			} else {
				utf16Ct += 2 // codepoint requires a surrogate pair
			}
		}
	}

	// Either we reached the requested column or we consumed the whole line
	// (which includes the valid case of a position exactly at end of line).
	// In both cases the units counted so far are the LSP column.
	return float64(utf16Ct)
}

// byteForLSPColumn takes an lsp.Position.Character value for the receiving line
// and finds the byte offset of the start of the UTF-8 sequence that represents
// it in the overall source buffer. This is different than the byte returned
// by posForLSPColumn because it can return offsets that are partway through
// a grapheme cluster, while HCL positions always sit on grapheme cluster
// boundaries.
//
// A negative column yields the byte offset of the start of the line, and a
// column beyond the end of the line yields the byte offset of the end of the
// line.
//
// Note that even this can't produce an exact result; if the column index
// refers to the second unit of a UTF-16 surrogate pair then it is rounded
// down to the first unit because UTF-8 sequences are not divisible in the
// same way.
func (l sourceLine) byteForLSPColumn(lspCol float64) int {
	inCol := int(lspCol)
	if inCol < 0 {
		return l.rng.Start.Byte
	}

	// Easy path: if the entire line is ASCII then column counts are equivalent
	// in LSP vs. HCL aside from zero- vs. one-based counting.
	if l.allASCII() {
		// Clamp columns past the end of the line so that we never return an
		// offset outside of the line, matching the non-ASCII path below.
		width := l.rng.End.Byte - l.rng.Start.Byte
		if inCol > width {
			return l.rng.End.Byte
		}
		return l.rng.Start.Byte + inCol
	}

	// If there are non-ASCII characters then we need to edge carefully
	// along the line while counting UTF-16 code units in our UTF-8 buffer,
	// since LSP columns are a count of UTF-16 units.
	byteCt := 0
	utf16Ct := 0
	remain := l.content
	for len(remain) > 0 {
		if utf16Ct >= inCol { // we've found it
			return l.rng.Start.Byte + byteCt
		}

		// Unlike our other conversion functions we're intentionally using
		// individual UTF-8 sequences here rather than grapheme clusters because
		// an LSP position might point into the middle of a grapheme cluster.
		adv, seq, err := textseg.ScanUTF8Sequences(remain, true)
		if err != nil || adv <= 0 {
			// Defensive: without forward progress this loop would never
			// terminate, so treat the column as invalid instead.
			break
		}

		// Work out how many UTF-16 code units this sequence occupies.
		units := 0
		for len(seq) > 0 {
			r, size := utf8.DecodeRune(seq)
			seq = seq[size:]
			c1, c2 := utf16.EncodeRune(r)
			if c1 == 0xfffd && c2 == 0xfffd {
				units++ // codepoint fits in one 16-bit unit
			} else {
				units += 2 // codepoint requires a surrogate pair
			}
		}

		if utf16Ct+units > inCol {
			// The requested column is the second unit of a surrogate pair,
			// so round down to the start of the sequence that contains it.
			return l.rng.Start.Byte + byteCt
		}

		remain = remain[adv:]
		byteCt += adv
		utf16Ct += units
	}

	// Ran out of characters on the line, so the given column is at or beyond
	// the end of the line.
	return l.rng.End.Byte
}