HellPot/heffalump/markov.go

package heffalump

import (
	"bufio"
	"io"
	"math/rand"
	"strings"
	"unicode"
	"unicode/utf8"

	"git.tcp.direct/kayos/common/squish"
)

// DefaultMarkovMap is the package-level Markov chain. It is populated by init
// from the text obtained by unpacking srcGz.
var DefaultMarkovMap MarkovMap

// init unpacks srcGz with squish.UnpackStr, builds DefaultMarkovMap from the
// resulting text and then creates DefaultHeffalump on top of that map. It
// panics if unpacking fails or yields an empty string.
func init() {
	text, unpackErr := squish.UnpackStr(srcGz)
	switch {
	case unpackErr != nil:
		panic(unpackErr)
	case len(text) == 0:
		panic("failed to unpack source")
	}

	DefaultMarkovMap = MakeMarkovMap(strings.NewReader(text))
	// 100*1<<10 evaluates to 102400; presumably a buffer size in bytes
	// (100 KiB) - confirm against NewHeffalump.
	DefaultHeffalump = NewHeffalump(DefaultMarkovMap, 100*1<<10)
}

// ScanHTML is a split function for a bufio.Scanner that tokenizes text mixed
// with HTML. Leading whitespace, as defined by unicode.IsSpace, is skipped.
// The token that follows is either a whole tag, running from '<' through the
// next '>', or a word, running up to the next whitespace rune or the next '<'.
// It never returns an empty token.
func ScanHTML(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Find where the token begins, remembering the last rune decoded so we
	// know afterwards whether the token opens a tag.
	var (
		begin int
		lead  rune
	)
	for begin < len(data) {
		var size int
		lead, size = utf8.DecodeRune(data[begin:])
		if !unicode.IsSpace(lead) {
			break
		}
		begin += size
	}

	if lead == '<' {
		// Tag: the token extends through the first closing bracket.
		for off, b := range data[begin:] {
			if b == '>' {
				end := begin + off + 1
				return end, data[begin:end], nil
			}
		}
	} else {
		// Word: the token extends to the next space or to the start of a tag.
		for pos := begin; pos < len(data); {
			r, size := utf8.DecodeRune(data[pos:])
			if unicode.IsSpace(r) {
				// Consume the delimiting space together with the word.
				return pos + size, data[begin:pos], nil
			}
			if r == '<' {
				// Leave the bracket in place so the next call sees the tag.
				return pos, data[begin:pos], nil
			}
			pos += size
		}
	}

	// No terminator was found. At EOF whatever is left is the final token.
	if atEOF && begin < len(data) {
		return len(data), data[begin:], nil
	}
	// Otherwise drop the skipped spaces and ask the Scanner for more data.
	return begin, nil, nil
}

// tokenPair is an ordered pair of tokens. It is the key type of MarkovMap.
type tokenPair [2]string

// MarkovMap is a map that acts as a Markov chain generator. Each key is a
// pair of tokens and each value lists the tokens recorded as following that
// pair.
type MarkovMap map[tokenPair][]string

// MakeMarkovMap creates a new MarkovMap and fills it with the tokens read
// from r.
func MakeMarkovMap(r io.Reader) MarkovMap {
	chain := make(MarkovMap)
	chain.Fill(r)
	return chain
}

// Fill tokenizes r with ScanHTML and adds every token to the MarkovMap, keyed
// by the two tokens that preceded it. The first tokens are keyed by empty
// strings. Errors encountered by the scanner are not reported.
func (mm MarkovMap) Fill(r io.Reader) {
	scanner := bufio.NewScanner(r)
	scanner.Split(ScanHTML)

	var (
		window [2]string // the two most recently seen tokens
		tok    string    // the token currently being recorded
	)
	for scanner.Scan() {
		tok = scanner.Text()
		mm.Add(window[0], window[1], tok)
		window[0], window[1] = window[1], tok
	}

	// Record one extra entry for the final pair. tok still holds the last
	// token scanned, or the empty string when r produced no tokens.
	mm.Add(window[0], window[1], tok)
}

// Add records w3 as one more token following the ordered pair (w1, w2).
// Repeated additions are kept, so the same token may be stored several times
// for one pair.
func (mm MarkovMap) Add(w1, w2, w3 string) {
	key := tokenPair{w1, w2}
	followers := append(mm[key], w3)
	mm[key] = followers
}

// Get pseudo-randomly chooses a possible suffix to w1 and w2. It returns the
// empty string when the pair (w1, w2) is not in the map or has no recorded
// suffixes.
func (mm MarkovMap) Get(w1, w2 string) string {
	p := tokenPair{w1, w2}
	suffix := mm[p]
	// A key that is present with an empty slice must be treated like a
	// missing key: rand.Intn panics when its argument is not positive.
	if len(suffix) == 0 {
		return ""
	}
	// We don't care about cryptographically sound entropy here, ignore gosec G404.
	/* #nosec */
	r := rand.Intn(len(suffix))
	return suffix[r]
}

// Read fills p with data from calling Get on the MarkovMap. Every call walks
// the chain afresh from the empty ("", "") pair and writes each token followed
// by a newline until the next token no longer fits.
//
// Read never returns an error. For a non-empty p it always returns n > 0: if
// not even the first token and its newline fit, as much of that token as p can
// hold is written instead, so callers that loop on Read (io.ReadFull, io.Copy)
// always make progress.
func (mm MarkovMap) Read(p []byte) (n int, err error) {
	if len(p) == 0 {
		return 0, nil
	}

	var w1, w2, w3 string
	for {
		w3 = mm.Get(w1, w2)
		// Stop once the token plus its trailing newline would overflow p. A
		// token that fits exactly is still written.
		if n+len(w3)+1 > len(p) {
			break
		}
		n += copy(p[n:], w3)
		n += copy(p[n:], "\n")
		w1, w2 = w2, w3
	}

	if n == 0 {
		// Nothing fit, so len(w3) >= len(p) and this copy fills p entirely.
		// A truncated token is emitted rather than returning 0 with a nil
		// error.
		n = copy(p, w3)
	}
	return n, nil
}
Broken: begin restructure 2021-09-15 13:43:01 +00:00			`package heffalump`
Initial working version 2016-12-11 01:38:18 +00:00
			`import (`
			`"bufio"`
			`"io"`
			`"math/rand"`
			`"strings"`
			`"unicode"`
			`"unicode/utf8"`
Minor adjustments 2022-07-26 05:46:04 +00:00
			`"git.tcp.direct/kayos/common/squish"`
Initial working version 2016-12-11 01:38:18 +00:00			`)`

Begin limiting writer implementation 2022-09-11 10:04:46 +00:00			`var DefaultMarkovMap MarkovMap`

			`func init() {`
			`// DefaultMarkovMap is a Markov chain based on src.`
Minor adjustments 2022-07-26 05:46:04 +00:00			`src, err := squish.UnpackStr(srcGz)`
Begin limiting writer implementation 2022-09-11 10:04:46 +00:00			`if err != nil {`
			`panic(err)`
			`}`
			`if len(src) < 1 {`
			`panic("failed to unpack source")`
			`}`
			`DefaultMarkovMap = MakeMarkovMap(strings.NewReader(src))`
			`DefaultHeffalump = NewHeffalump(DefaultMarkovMap, 100*1<<10)`
			`}`

Initial working version 2016-12-11 01:38:18 +00:00			`// ScanHTML is a basic split function for a Scanner that returns each`
			`// space-separated word of text or HTML tag, with surrounding spaces deleted.`
			`// It will never return an empty string. The definition of space is set by`
			`// unicode.IsSpace.`
			`func ScanHTML(data []byte, atEOF bool) (advance int, token []byte, err error) {`
			`// Skip leading spaces.`
			`var r rune`
Lint: general code cleanup and gzipping of source material 2021-09-15 08:28:09 +00:00			`var start = 0`
Initial working version 2016-12-11 01:38:18 +00:00			`for width := 0; start < len(data); start += width {`
			`r, width = utf8.DecodeRune(data[start:])`
			`if !unicode.IsSpace(r) {`
			`break`
			`}`
			`}`
Begin limiting writer implementation 2022-09-11 10:04:46 +00:00			`switch {`
			`case r == '<':`
Initial working version 2016-12-11 01:38:18 +00:00			`// Scan until closing bracket`
			`for i := start; i < len(data); i++ {`
			`if data[i] == '>' {`
			`return i + 1, data[start : i+1], nil`
			`}`
			`}`
Begin limiting writer implementation 2022-09-11 10:04:46 +00:00			`default:`
Initial working version 2016-12-11 01:38:18 +00:00			`// Scan until space, marking end of word.`
			`for width, i := 0, start; i < len(data); i += width {`
			`var r rune`
			`r, width = utf8.DecodeRune(data[i:])`
			`if unicode.IsSpace(r) {`
			`return i + width, data[start:i], nil`
			`}`
			`if r == '<' {`
			`return i, data[start:i], nil`
			`}`
			`}`
			`}`
			`// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.`
			`if atEOF && len(data) > start {`
			`return len(data), data[start:], nil`
			`}`
			`// Request more data.`
			`return start, nil, nil`
			`}`

Better code organization, reusability 2016-12-11 04:14:03 +00:00			`type tokenPair [2]string`

Add documentation 2016-12-12 00:15:14 +00:00			`// MarkovMap is a map that acts as a Markov chain generator.`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`type MarkovMap map[tokenPair][]string`
Initial working version 2016-12-11 01:38:18 +00:00
Add documentation 2016-12-12 00:15:14 +00:00			`// MakeMarkovMap makes an empty MakeMarkov and fills it with r.`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`func MakeMarkovMap(r io.Reader) MarkovMap {`
			`m := MarkovMap{}`
			`m.Fill(r)`
			`return m`
			`}`
Initial working version 2016-12-11 01:38:18 +00:00
Add documentation 2016-12-12 00:15:14 +00:00			`// Fill adds all the tokens in r to a MarkovMap`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`func (mm MarkovMap) Fill(r io.Reader) {`
			`var w1, w2, w3 string`
Initial working version 2016-12-11 01:38:18 +00:00
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`s := bufio.NewScanner(r)`
Initial working version 2016-12-11 01:38:18 +00:00			`s.Split(ScanHTML)`
			`for s.Scan() {`
Begin limiting writer implementation 2022-09-11 10:04:46 +00:00			`w3 = s.Text()`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`mm.Add(w1, w2, w3)`
			`w1, w2 = w2, w3`
			`}`

			`mm.Add(w1, w2, w3)`
			`}`

Add documentation 2016-12-12 00:15:14 +00:00			`// Add adds a three token sequence to the map.`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`func (mm MarkovMap) Add(w1, w2, w3 string) {`
			`p := tokenPair{w1, w2}`
			`mm[p] = append(mm[p], w3)`
			`}`

Docs: Fix misspelling 2016-12-12 00:21:35 +00:00			`// Get pseudo-randomly chooses a possible suffix to w1 and w2.`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`func (mm MarkovMap) Get(w1, w2 string) string {`
			`p := tokenPair{w1, w2}`
			`suffix, ok := mm[p]`
			`if !ok {`
			`return ""`
Initial working version 2016-12-11 01:38:18 +00:00			`}`
CI: https://github.com/securego/gosec/issues/469 2022-04-21 02:47:40 +00:00			`// We don't care about cryptographically sound entropy here, ignore gosec G404.`
			`/* #nosec */`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`r := rand.Intn(len(suffix))`
			`return suffix[r]`
Initial working version 2016-12-11 01:38:18 +00:00			`}`

Add documentation 2016-12-12 00:15:14 +00:00			`// Read fills p with data from calling Get on the MarkovMap.`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`func (mm MarkovMap) Read(p []byte) (n int, err error) {`
			`var w1, w2, w3 string`
			`for {`
			`w3 = mm.Get(w1, w2)`
			`if n+len(w3)+1 >= len(p) {`
Initial working version 2016-12-11 01:38:18 +00:00			`break`
			`}`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`n += copy(p[n:], w3)`
			`n += copy(p[n:], "\n")`
			`w1, w2 = w2, w3`
Initial working version 2016-12-11 01:38:18 +00:00			`}`
Better code organization, reusability 2016-12-11 04:14:03 +00:00			`return`
Initial working version 2016-12-11 01:38:18 +00:00			`}`