/**
 * Unit tests for token_utils.ts.
 */

// Side-effect import; presumably brings the Jasmine globals
// (describe/it/expect) into scope for this spec -- confirm against the
// project's test setup.
import 'jasmine';

// Module under test.
import * as tokenUtils from './token_utils';

/**
 * Checks that cleanSpmText replaces each '▁' marker in the sample text with
 * a plain space while leaving the embedded newlines untouched.
 */
describe('cleanSpmText test', () => {
  it('cleans magic underscores from SPM output', () => {
    const text = 'Summarize▁this▁sentence:\n\nOnce▁upon▁a▁time';
    expect(tokenUtils.cleanSpmText(text))
        .toEqual('Summarize this sentence:\n\nOnce upon a time');
  });
});

/**
 * Table-driven tests for groupTokensByRegexPrefix.
 *
 * Each case supplies a token list, a global regex, and the grouping expected
 * back; one Jasmine spec is registered per case. Judging by the expectations
 * below, a new group begins at each token where a regex match starts in the
 * concatenated token text -- see token_utils.ts for the authoritative
 * contract.
 */
describe('groupTokensByRegexPrefix test', () => {
  [
    {
      testcaseName: 'groups tokens by word',
      tokens: ['Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':'],
      // A word starts at any run of magic underscores or whitespace.
      regex: /[▁\s]+/g,
      expectedGroups:
          [['Sum', 'mar', 'ize'], ['▁this'], ['▁sent', 'ence', ':']],
    },
    {
      testcaseName: 'groups tokens by word, handling newlines',
      tokens: [
        'Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':', '\n', '\n', 'Once',
        '▁upon', '▁a', '▁time'
      ],
      // Consecutive newlines should be their own segment.
      // Start a new word on the first non-\n afterwards.
      regex: /([▁\s]+)|(?<=\n)[^\n]/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize'], ['▁this'], ['▁sent', 'ence', ':'], ['\n', '\n'],
        ['Once'], ['▁upon'], ['▁a'], ['▁time']
      ],
    },
    {
      testcaseName: 'groups tokens by sentence, simple version',
      tokens: [
        'Sent', 'ence', '▁one', '.', '▁Sent', 'ence', '▁two', '!', '▁Sent',
        'ence', '▁three', '?'
      ],
      // A sentence starts at whitespace or a magic underscore that follows
      // one of the punctuation marks [.?!].
      regex: /(?<=[.?!])[▁\s]+/g,
      expectedGroups: [
        ['Sent', 'ence', '▁one', '.'],
        ['▁Sent', 'ence', '▁two', '!'],
        ['▁Sent', 'ence', '▁three', '?'],
      ],
    },
    {
      testcaseName: 'groups tokens by sentence, handling newlines',
      tokens: [
        'Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':', '\n', '\n', 'Once',
        '▁upon', '▁a', '▁time'
      ],
      // Sentence start is one of:
      // - a run of consecutive \n as its own segment
      // - any non-\n following \n
      // - whitespace or magic underscore following punctuation [.?!]
      regex: /(\n+)|((?<=\n)[^\n])|((?<=[.?!])([▁\s]+))/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':'], ['\n', '\n'],
        ['Once', '▁upon', '▁a', '▁time']
      ],
    },
    {
      testcaseName: 'groups tokens by line',
      tokens: [
        'Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':', '\n', '\n', 'Once',
        '▁upon', '▁a', '▁time'
      ],
      // Each match is either:
      // - a run of consecutive \n, kept as its own segment
      // - a run of non-\n characters, i.e. the content of one line
      regex: /(\n+)|([^\n]+)/g,
      expectedGroups: [
        ['Sum', 'mar', 'ize', '▁this', '▁sent', 'ence', ':'], ['\n', '\n'],
        ['Once', '▁upon', '▁a', '▁time']
      ],
    },
  ].forEach(({testcaseName, tokens, regex, expectedGroups}) => {
    it(testcaseName, () => {
      const groups = tokenUtils.groupTokensByRegexPrefix(tokens, regex);
      expect(groups).toEqual(expectedGroups);
    });
  });
});