Merge pull request #45 from slingamn/utf8_truncation.1

ircmsg: make truncation utf8-safe
This commit is contained in:
Shivaram Lingamneni 2021-03-03 14:23:43 -05:00 committed by GitHub
commit 78fec0a07e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 201 additions and 57 deletions

@ -9,6 +9,7 @@ import (
"bytes"
"errors"
"strings"
"unicode/utf8"
)
const (
@ -34,17 +35,30 @@ const (
var (
	// ErrorLineIsEmpty indicates that the given IRC line was empty.
	ErrorLineIsEmpty = errors.New("Line is empty")

	// ErrorLineContainsBadChar indicates that the line contained invalid characters
	ErrorLineContainsBadChar = errors.New("Line contains invalid characters")

	// ErrorLineTooLong indicates that the message exceeded the maximum tag length
	// (the name references 417 ERR_INPUTTOOLONG; we reserve the right to return it
	// for messages that exceed the non-tag length limit)
	//
	// NOTE(review): the tag-length checks visible in this change return
	// ErrorTagsTooLong instead; confirm whether this sentinel still has callers
	// before removing it. It is kept here so existing references keep compiling.
	ErrorLineTooLong = errors.New("Line could not be parsed because a specified length limit was exceeded")

	// ErrorBodyTooLong indicates that the message body exceeded the specified
	// length limit (typically 512 bytes). This error is non-fatal; if encountered
	// when parsing a message, the message is parsed up to the length limit, and
	// if encountered when serializing a message, the message is truncated to the limit.
	ErrorBodyTooLong = errors.New("Line body exceeded the specified length limit; outgoing messages will be truncated")

	// ErrorTagsTooLong indicates that the message exceeded the maximum tag length
	// (the specified response on the server side is 417 ERR_INPUTTOOLONG).
	ErrorTagsTooLong = errors.New("Line could not be processed because its tag data exceeded the length limit")

	// ErrorInvalidTagContent indicates that a tag name or value was invalid
	ErrorInvalidTagContent = errors.New("Line could not be processed because it contained an invalid tag name or value")

	// ErrorCommandMissing indicates that an IRC message was invalid because it lacked a command.
	ErrorCommandMissing = errors.New("IRC messages MUST have a command")

	// ErrorBadParam indicates that an IRC message could not be serialized because
	// its parameters violated the syntactic constraints on IRC parameters:
	// non-final parameters cannot be empty, contain a space, or start with `:`.
	ErrorBadParam = errors.New("Cannot have an empty param, a param with spaces, or a param that starts with ':' before the last parameter")
)
// IRCMessage represents an IRC message, as defined by the RFCs and as
@ -148,28 +162,36 @@ func ParseLine(line string) (ircmsg IRCMessage, err error) {
// ParseLineStrict creates and returns an IRCMessage from the given IRC line,
// taking the maximum length into account and truncating the message as appropriate.
// If fromClient is true, it enforces the client limit on tag data length (4094 bytes),
// allowing the server to return ERR_INPUTTOOLONG as appropriate. If truncateLen is
// allowing the server to return ERR_INPUTTOOLONG as appropriate. If maxLenBody is
// nonzero, it is the length at which the non-tag portion of the message is truncated.
func ParseLineStrict(line string, fromClient bool, truncateLen int) (ircmsg IRCMessage, err error) {
func ParseLineStrict(line string, fromClient bool, maxLenBody int) (ircmsg IRCMessage, err error) {
maxTagDataLength := MaxlenTagData
if fromClient {
maxTagDataLength = MaxlenClientTagData
}
return parseLine(line, maxTagDataLength, truncateLen)
return parseLine(line, maxTagDataLength, maxLenBody)
}
// trimInitialSpaces slices off any number of leading ' ' bytes from str.
// Only the ASCII space is stripped; tabs and other whitespace are kept, and
// spaces that are not at the very front are left untouched.
func trimInitialSpaces(str string) string {
	return strings.TrimLeft(str, " ")
}
func parseLine(line string, maxTagDataLength int, truncateLen int) (ircmsg IRCMessage, err error) {
func parseLine(line string, maxTagDataLength int, maxLenBody int) (ircmsg IRCMessage, err error) {
// remove either \n or \r\n from the end of the line:
line = strings.TrimSuffix(line, "\n")
line = strings.TrimSuffix(line, "\r")
// whether we removed them ourselves, or whether they were removed previously,
// they count against the line limit:
if maxLenBody != 0 {
if maxLenBody <= 2 {
return ircmsg, ErrorLineIsEmpty
}
maxLenBody -= 2
}
// now validate for the 3 forbidden bytes:
if strings.IndexByte(line, '\x00') != -1 || strings.IndexByte(line, '\n') != -1 || strings.IndexByte(line, '\r') != -1 {
return ircmsg, ErrorLineContainsBadChar
@ -187,7 +209,7 @@ func parseLine(line string, maxTagDataLength int, truncateLen int) (ircmsg IRCMe
}
tags := line[1:tagEnd]
if 0 < maxTagDataLength && maxTagDataLength < len(tags) {
return ircmsg, ErrorLineTooLong
return ircmsg, ErrorTagsTooLong
}
err = ircmsg.parseTags(tags)
if err != nil {
@ -198,8 +220,8 @@ func parseLine(line string, maxTagDataLength int, truncateLen int) (ircmsg IRCMe
}
// truncate if desired
if 0 < truncateLen && truncateLen < len(line) {
line = line[:truncateLen]
if maxLenBody != 0 && maxLenBody < len(line) {
err = ErrorBodyTooLong
}
// modern: "These message parts, and parameters themselves, are separated
@ -252,7 +274,7 @@ func parseLine(line string, maxTagDataLength int, truncateLen int) (ircmsg IRCMe
line = line[paramEnd+1:]
}
return ircmsg, nil
return ircmsg, err
}
// helper to parse tags
@ -337,8 +359,8 @@ func paramRequiresTrailing(param string) bool {
}
// line returns a sendable line created from an IRCMessage.
func (ircmsg *IRCMessage) line(tagLimit, clientOnlyTagDataLimit, serverAddedTagDataLimit, truncateLen int) ([]byte, error) {
if len(ircmsg.Command) < 1 {
func (ircmsg *IRCMessage) line(tagLimit, clientOnlyTagDataLimit, serverAddedTagDataLimit, truncateLen int) (result []byte, err error) {
if len(ircmsg.Command) == 0 {
return nil, ErrorCommandMissing
}
@ -382,10 +404,10 @@ func (ircmsg *IRCMessage) line(tagLimit, clientOnlyTagDataLimit, serverAddedTagD
lenTags = buf.Len()
if 0 < tagLimit && tagLimit < buf.Len() {
return nil, ErrorLineTooLong
return nil, ErrorTagsTooLong
}
if (0 < clientOnlyTagDataLimit && clientOnlyTagDataLimit < lenClientOnlyTags) || (0 < serverAddedTagDataLimit && serverAddedTagDataLimit < lenRegularTags) {
return nil, ErrorLineTooLong
return nil, ErrorTagsTooLong
}
if len(ircmsg.Prefix) > 0 {
@ -408,18 +430,33 @@ func (ircmsg *IRCMessage) line(tagLimit, clientOnlyTagDataLimit, serverAddedTagD
buf.WriteString(param)
}
// truncate if desired
// -2 for \r\n
restLen := buf.Len() - lenTags
if 0 < truncateLen && (truncateLen-2) < restLen {
buf.Truncate(lenTags + (truncateLen - 2))
// truncate if desired; leave 2 bytes over for \r\n:
if truncateLen != 0 && (truncateLen-2) < (buf.Len()-lenTags) {
err = ErrorBodyTooLong
newBufLen := lenTags + (truncateLen - 2)
buf.Truncate(newBufLen)
// XXX: we may have truncated in the middle of a UTF8-encoded codepoint;
// if so, remove additional bytes, stopping when the sequence either
// ends in a valid codepoint, or we have removed 3 bytes (the maximum
// length of the remnant of a once-valid, truncated codepoint; we don't
// want to truncate the entire message if it wasn't UTF8 in the first
// place).
for i := 0; i < (utf8.UTFMax - 1); i++ {
r, n := utf8.DecodeLastRune(buf.Bytes())
if r == utf8.RuneError && n <= 1 {
newBufLen--
buf.Truncate(newBufLen)
} else {
break
}
}
}
buf.WriteString("\r\n")
result := buf.Bytes()
result = buf.Bytes()
toValidate := result[:len(result)-2]
if bytes.IndexByte(toValidate, '\x00') != -1 || bytes.IndexByte(toValidate, '\r') != -1 || bytes.IndexByte(toValidate, '\n') != -1 {
return nil, ErrorLineContainsBadChar
}
return result, nil
return result, err
}

@ -1,9 +1,12 @@
package ircmsg
import (
"bytes"
"fmt"
"reflect"
"strings"
"testing"
"unicode/utf8"
)
type testcode struct {
@ -11,30 +14,29 @@ type testcode struct {
message IRCMessage
}
// testcodewithlen is a table entry for the length-limited encode/decode tests.
type testcodewithlen struct {
	// raw is the wire form of the line, including the trailing "\r\n".
	raw string
	// length is the body length limit handed to ParseLineStrict or
	// LineBytesStrict by the tests.
	length int
	// message is the structured form of the line.
	message IRCMessage
	// truncateExpected reports whether the operation is expected to return
	// ErrorBodyTooLong rather than a nil error.
	truncateExpected bool
}
var decodelentests = []testcodewithlen{
{":dan-!d@localhost PRIVMSG dan #test :What a cool message\r\n", 20,
MakeMessage(nil, "dan-!d@localhost", "PR")},
{":dan-!d@localhost PRIVMSG dan #test :What a cool message\r\n", 22,
MakeMessage(nil, "dan-!d@localhost", "PRIVMSG", "dan", "#test", "What a cool message"), true},
{"@time=12732;re TEST *\r\n", 512,
MakeMessage(map[string]string{"time": "12732", "re": ""}, "", "TEST", "*")},
MakeMessage(map[string]string{"time": "12732", "re": ""}, "", "TEST", "*"), false},
{"@time=12732;re TEST *\r\n", 512,
MakeMessage(map[string]string{"time": "12732", "re": ""}, "", "TEST", "*")},
MakeMessage(map[string]string{"time": "12732", "re": ""}, "", "TEST", "*"), false},
{":dan- TESTMSG\r\n", 2048,
MakeMessage(nil, "dan-", "TESTMSG")},
{":dan- TESTMSG dan \r\n", 12,
MakeMessage(nil, "dan-", "TESTMS")},
MakeMessage(nil, "dan-", "TESTMSG"), false},
{"TESTMSG\r\n", 6,
MakeMessage(nil, "", "TESTMS")},
MakeMessage(nil, "", "TESTMSG"), true},
{"TESTMSG\r\n", 7,
MakeMessage(nil, "", "TESTMSG")},
MakeMessage(nil, "", "TESTMSG"), true},
{"TESTMSG\r\n", 8,
MakeMessage(nil, "", "TESTMSG")},
MakeMessage(nil, "", "TESTMSG"), true},
{"TESTMSG\r\n", 9,
MakeMessage(nil, "", "TESTMSG")},
MakeMessage(nil, "", "TESTMSG"), false},
}
// map[string]string{"time": "12732", "re": ""}
@ -100,15 +102,22 @@ var decodetesterrors = []testparseerror{
{"privmsg #channel :command injection attempt \r:Nickserv PRIVMSG user :Please re-enter your password", ErrorLineContainsBadChar},
}
// validateTruncateError checks err against the expectation stored in pair:
// ErrorBodyTooLong when pair.truncateExpected is set, nil otherwise. Any
// mismatch is reported through t.Error, so the calling test keeps running.
func validateTruncateError(pair testcodewithlen, err error, t *testing.T) {
	switch {
	case pair.truncateExpected && err != ErrorBodyTooLong:
		t.Error("For", pair.raw, "expected truncation, but got error", err)
	case !pair.truncateExpected && err != nil:
		t.Error("For", pair.raw, "expected no error, but got", err)
	}
}
func TestDecode(t *testing.T) {
for _, pair := range decodelentests {
ircmsg, err := ParseLineStrict(pair.raw, true, pair.length)
if err != nil {
t.Error(
"For", pair.raw,
"Failed to parse line:", err,
)
}
validateTruncateError(pair, err, t)
if !reflect.DeepEqual(ircmsg, pair.message) {
t.Error(
@ -159,11 +168,11 @@ var encodetests = []testcode{
}
var encodelentests = []testcodewithlen{
{":dan-!d@lo\r\n", 12,
MakeMessage(nil, "dan-!d@localhost", "PRIVMSG", "dan", "#test", "What a cool message")},
MakeMessage(nil, "dan-!d@localhost", "PRIVMSG", "dan", "#test", "What a cool message"), true},
{"@time=12732 TEST *\r\n", 52,
MakeMessage(map[string]string{"time": "12732"}, "", "TEST", "*")},
MakeMessage(map[string]string{"time": "12732"}, "", "TEST", "*"), false},
{"@riohwihowihirgowihre TEST *\r\n", 8,
MakeMessage(map[string]string{"riohwihowihirgowihre": ""}, "", "TEST", "*", "*")},
MakeMessage(map[string]string{"riohwihowihirgowihre": ""}, "", "TEST", "*", "*"), true},
}
func TestEncode(t *testing.T) {
@ -203,12 +212,7 @@ func TestEncode(t *testing.T) {
}
for _, pair := range encodelentests {
line, err := pair.message.LineBytesStrict(true, pair.length)
if err != nil {
t.Error(
"For", pair.raw,
"Failed to parse line:", err,
)
}
validateTruncateError(pair, err, t)
if string(line) != pair.raw {
t.Error(
@ -373,7 +377,7 @@ func TestErrorLineTooLongGeneration(t *testing.T) {
message.SetTag(fmt.Sprintf("+client-tag-%d", i), "ok")
}
line, err = message.LineBytesStrict(true, 0)
if err != ErrorLineTooLong {
if err != ErrorTagsTooLong {
t.Error(err)
}
@ -382,7 +386,7 @@ func TestErrorLineTooLongGeneration(t *testing.T) {
message.SetTag(fmt.Sprintf("server-tag-%d", i), "ok")
}
line, err = message.LineBytesStrict(true, 0)
if err != ErrorLineTooLong {
if err != ErrorTagsTooLong {
t.Error(err)
}
@ -394,7 +398,7 @@ func TestErrorLineTooLongGeneration(t *testing.T) {
}
// client cannot send this much tag data:
line, err = message.LineBytesStrict(true, 0)
if err != ErrorLineTooLong {
if err != ErrorTagsTooLong {
t.Error(err)
}
// but a server can, since the tags are split between client and server budgets:
@ -404,6 +408,109 @@ func TestErrorLineTooLongGeneration(t *testing.T) {
}
}
// truncateTests holds one sample codepoint per UTF-8 encoded length, ordered
// so that the entry at index i is i+1 bytes long.
var truncateTests = []string{
	// 1 byte: U+0078, Latin Small Letter X
	"x",
	// 2 bytes: U+00E7, Latin Small Letter C with Cedilla
	"ç",
	// 3 bytes: U+A66E, Cyrillic Letter Multiocular O
	"ꙮ",
	// 4 bytes: U+1F42C, Dolphin
	"🐬",
}
// assertEqual panics with a message showing both values (in Go syntax) unless
// found and expected are deeply equal according to reflect.DeepEqual.
func assertEqual(found, expected interface{}) {
	if reflect.DeepEqual(found, expected) {
		return
	}
	msg := fmt.Sprintf("expected %#v, found %#v", expected, found)
	panic(msg)
}
// buildPingParam returns a string made of initialLen 'a' bytes followed by
// as many copies of encChar as it takes for the total byte length to exceed
// minLen. No copy of encChar is appended when the 'a' prefix alone is already
// longer than minLen.
func buildPingParam(initialLen, minLen int, encChar string) (result string) {
	var sb strings.Builder
	for written := 0; written < initialLen; written++ {
		sb.WriteByte('a')
	}
	for {
		if sb.Len() > minLen {
			break
		}
		sb.WriteString(encChar)
	}
	return sb.String()
}
// min returns the smaller of the two ints (either one when they are equal).
func min(i, j int) int {
	if j <= i {
		return j
	}
	return i
}
// TestTruncate serializes PING messages whose final parameter is a run of
// 'a' bytes followed by one or more copies of a sample codepoint, using a
// 512-byte limit. Every encoded length (1 to 4 bytes) is tried at a range of
// alignments relative to the limit. The output must always be valid UTF-8,
// must never exceed 512 bytes, and must lose at most 3 bytes below the limit.
func TestTruncate(t *testing.T) {
	const limit = 512

	for pos, sample := range truncateTests {
		// sanity check that we have the expected lengths:
		assertEqual(len(sample), pos+1)
		wantRune, _ := utf8.DecodeRuneInString(sample)
		if wantRune == utf8.RuneError {
			panic("invalid codepoint in test suite")
		}

		// "PING [param]\r\n", max parameter size is 512-7=505 bytes
		for prefixLen := 490; prefixLen < 500; prefixLen++ {
			for extra := 1; extra < 50; extra++ {
				param := buildPingParam(prefixLen, prefixLen+extra, sample)
				msg := MakeMessage(nil, "", "PING", param)
				limited, err := msg.LineBytesStrict(false, limit)
				unlimited, _ := msg.LineBytes()

				// the returned error must agree with whether bytes were dropped
				untouched := len(limited) == len(unlimited)
				switch {
				case untouched && err != nil:
					t.Error("message was not truncated, but got error", err)
				case !untouched && err != ErrorBodyTooLong:
					t.Error("message was truncated, but got error", err)
				}

				// neither over the limit nor more than 3 bytes short of it
				if len(limited) > limit || len(limited) < min(limit-3, len(unlimited)) {
					t.Errorf("invalid serialized length %d", len(limited))
				}
				if !utf8.Valid(limited) {
					t.Errorf("PING %s encoded to invalid UTF8: %#v\n", param, limited)
				}

				// skip over "PING "
				head, _ := utf8.DecodeRune(limited[5:])
				assertEqual(head, rune('a'))
				body := bytes.TrimSuffix(limited, []byte("\r\n"))
				tail, _ := utf8.DecodeLastRune(body)
				assertEqual(tail, wantRune)
			}
		}
	}
}
// TestTruncateNonUTF8 serializes PING messages whose parameter consists only
// of 0xff bytes (never valid UTF-8), for parameter lengths on both sides of
// the 512-byte limit. The output must stay within the limit and must not be
// shortened by more than 3 bytes below it.
func TestTruncateNonUTF8(t *testing.T) {
	for paramLen := 490; paramLen < 530; paramLen++ {
		param := strings.Repeat("\xff", paramLen)
		msg := MakeMessage(nil, "", "PING", param)
		encoded, err := msg.LineBytesStrict(false, 512)
		if err != nil && err != ErrorBodyTooLong {
			panic(err)
		}

		// full length is "PING <param>\r\n", 7+len(param)
		tooLong := len(encoded) > 512
		tooShort := len(encoded) < min(512-3, 7+len(param))
		if tooLong || tooShort {
			t.Errorf("invalid serialized length %d", len(encoded))
		}
	}
}
func BenchmarkGenerate(b *testing.B) {
msg := MakeMessage(
map[string]string{"time": "2019-02-28T08:12:43.480Z", "account": "shivaram"},