package textseg import ( "errors" "unicode/utf8" ) // Generated from grapheme_clusters.rl. DO NOT EDIT %%{ # (except you are actually in grapheme_clusters.rl here, so edit away!) machine graphclust; write data; }%% var Error = errors.New("invalid UTF8 text") // ScanGraphemeClusters is a split function for bufio.Scanner that splits // on grapheme cluster boundaries. func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) { if len(data) == 0 { return 0, nil, nil } // Ragel state cs := 0 // Current State p := 0 // "Pointer" into data pe := len(data) // End-of-data "pointer" ts := 0 te := 0 act := 0 eof := pe // Make Go compiler happy _ = ts _ = te _ = act _ = eof startPos := 0 endPos := 0 %%{ include GraphemeCluster "grapheme_clusters_table.rl"; action start { startPos = p } action end { endPos = p } action emit { return endPos+1, data[startPos:endPos+1], nil } ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?; AnyExtender = Extend | ZWJGlue | SpacingMark; Extension = AnyExtender*; ReplacementChar = (0xEF 0xBF 0xBD); CRLFSeq = CR LF; ControlSeq = Control | ReplacementChar; HangulSeq = ( L+ (((LV? V+ | LVT) T*)?|LV?) | LV V* T* | V+ T* | LVT T* | T+ ) Extension; EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension; ZWJSeq = ZWJGlue Extension; EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension; UTF8Cont = 0x80 .. 0xBF; AnyUTF8 = ( 0x00..0x7F | 0xC0..0xDF . UTF8Cont | 0xE0..0xEF . UTF8Cont . UTF8Cont | 0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont ); # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension; # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?; CRLFTok = CRLFSeq >start @end; ControlTok = ControlSeq >start @end; HangulTok = HangulSeq >start @end; EmojiTok = EmojiSeq >start @end; ZWJTok = ZWJSeq >start @end; EmojiFlagTok = EmojiFlagSeq >start @end; OtherTok = OtherSeq >start @end; PrependTok = PrependSeq >start @end; main := |* CRLFTok => emit; ControlTok => emit; HangulTok => emit; EmojiTok => emit; ZWJTok => emit; EmojiFlagTok => emit; PrependTok => emit; OtherTok => emit; # any single valid UTF-8 character would also be valid per spec, # but we'll handle that separately after the loop so we can deal # with requesting more bytes if we're not at EOF. *|; write init; write exec; }%% // If we fall out here then we were unable to complete a sequence. // If we weren't able to complete a sequence then either we've // reached the end of a partial buffer (so there's more data to come) // or we have an isolated symbol that would normally be part of a // grapheme cluster but has appeared in isolation here. if !atEOF { // Request more return 0, nil, nil } // Just take the first UTF-8 sequence and return that. _, seqLen := utf8.DecodeRune(data) return seqLen, data[:seqLen], nil }