From d528e1af75e5b253b21ec90fa9d5f33737fd4909 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 13 Jun 2024 11:39:01 -0700 Subject: [PATCH] fix utf16 for multibyte runes --- parser/parser.go | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/parser/parser.go b/parser/parser.go index 4f44f6af..686a1e69 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -8,7 +8,9 @@ import ( "io" "strconv" "strings" - "unicode" + + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" ) type File struct { @@ -69,14 +71,11 @@ func ParseFile(r io.Reader) (*File, error) { var b bytes.Buffer var role string - var lineCount int - var linePos int - - var utf16 bool - var f File - br := bufio.NewReader(r) + tr := unicode.BOMOverride(unicode.UTF8.NewDecoder()) + br := bufio.NewReader(transform.NewReader(r, tr)) + for { r, _, err := br.ReadRune() if errors.Is(err, io.EOF) { @@ -85,17 +84,6 @@ func ParseFile(r io.Reader) (*File, error) { return nil, err } - // the utf16 byte order mark will be read as "unreadable" by ReadRune() - if isUnreadable(r) && lineCount == 0 && linePos == 0 { - utf16 = true - continue - } - - // skip the second byte if we're reading utf16 - if utf16 && r == 0 { - continue - } - next, r, err := parseRuneForState(r, curr) if errors.Is(err, io.ErrUnexpectedEOF) { return nil, fmt.Errorf("%w: %s", err, b.String()) @@ -103,13 +91,6 @@ func ParseFile(r io.Reader) (*File, error) { return nil, err } - if isNewline(r) { - lineCount++ - linePos = 0 - } else { - linePos++ - } - // process the state transition, some transitions need to be intercepted and redirected if next != curr { switch curr { @@ -309,10 +290,6 @@ func isNewline(r rune) bool { return r == '\r' || r == '\n' } -func isUnreadable(r rune) bool { - return r == unicode.ReplacementChar -} - func isValidMessageRole(role string) bool { return role == "system" || role == "user" || role == "assistant" }