-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.erl
executable file
·84 lines (70 loc) · 3.22 KB
/
index.erl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
-module(index).
-export([create/1]).
% Build a word index for the text file called Name.
%
% The file is read line by line, lines are numbered from 1, each line is
% split into words, and the occurrences are sorted by word and folded into
% ranges of consecutive line numbers. An entry looks like
%
% { "foo" , [{3,5},{7,7},{11,13}] }
%
% meaning "foo" appears on lines 3-5, on line 7 and on lines 11-13.
% Crashes with badmatch if the file cannot be opened.
-spec create(string()) -> [{string(), [{integer(), integer()}]}].
create(Name) ->
    Contents = get_file_contents(Name),
    Numbered = number(Contents),
    Occurrences = split(Numbered),
    Sorted = sort(Occurrences),
    group(Sorted).
% Pair every line with its 1-based position in the list.
% number(["a", "b"]) returns [{"a", 1}, {"b", 2}].
-spec number([string()]) -> [{string(), integer()}].
number(Lines) ->
    % Thread the running line number through the map as the accumulator.
    {Numbered, _Next} =
        lists:mapfoldl(fun(Line, N) -> {{Line, N}, N + 1} end, 1, Lines),
    Numbered.
% Expand every numbered line into one {Word, LineNumber} pair per word
% that words/1 extracts from it. Pairs keep the order of the input lines.
-spec split([{string(), integer()}]) -> [{string(), integer()}].
split(Lines) ->
    Explode = fun({Line, Index}) ->
        [{Word, Index} || Word <- words(Line)]
    end,
    lists:flatmap(Explode, Lines).
% Break a line into its indexable words.
% The line is cut at spaces and at the characters . ! ? , and `; the
% resulting tokens are then filtered and lower-cased by normalize/1.
-spec words(string()) -> [string()].
words(Line) ->
    Tokens = string:tokens(Line, " .!?,`"),
    normalize(Tokens).
% Keep only the tokens accepted by valid/1 and convert them to lower case.
% The validity check runs on the token as written, before lower-casing.
-spec normalize([string()]) -> [string()].
normalize(Words) ->
    [string:to_lower(Word) || Word <- Words, valid(Word)].
% Decide whether a token belongs in the index.
% A token is accepted when every character is an ASCII letter, an
% apostrophe or a hyphen, and the token is at least three characters long.
-spec valid(string()) -> boolean().
valid(Word) ->
    IsWordChar = fun
        ($') -> true;
        ($-) -> true;
        (Char) when Char >= $a, Char =< $z -> true;
        (Char) when Char >= $A, Char =< $Z -> true;
        (_) -> false
    end,
    lists:all(IsWordChar, Word) andalso length(Word) >= 3.
% Sort word/line number pairs by word only.
% Pairs that share a word keep their relative input order. group/1 depends
% on this: it expects each word's line numbers to stay in the ascending
% order that split/1 produced. lists:keysort/2 is used because its
% stability is documented, rather than relying on the behaviour of a custom
% comparison fun passed to lists:sort/2.
-spec sort([{string(), integer()}]) -> [{string(), integer()}].
sort(Words) ->
    lists:keysort(1, Words).
% Collapse a sorted list of {Word, LineNumber} pairs into one entry per
% word, each holding a list of {First, Last} line ranges.
% The fold runs from the right, so coalesce/2 sees each word's line numbers
% from highest to lowest and grows ranges downwards.
-spec group([{string(), integer()}]) -> [{string(), [{integer(), integer()}]}].
group(Words) ->
    lists:foldr(fun(Occurrence, Index) -> coalesce(Occurrence, Index) end, [], Words).
% Helper for group/1: merge one {Word, LineNumber} occurrence into the
% index built so far. Only the entry at the head of the index is inspected,
% so occurrences of the same word must arrive consecutively.
-spec coalesce({string(), integer()}, [{string(), [{integer(), integer()}]}]) ->
    [{string(), [{integer(), integer()}]}].
coalesce({Word, Index}, [{Word, Ranges} | Rest]) ->
    % Same word as the head entry: update its range list.
    [{Word, add_line_to_ranges(Index, Ranges)} | Rest];
coalesce({Word, Index}, Entries) ->
    % Empty index or a different word: start a new entry.
    [{Word, [{Index, Index}]} | Entries].

% Add a line number to a word's range list (first range has the lowest lines).
-spec add_line_to_ranges(integer(), [{integer(), integer()}]) -> [{integer(), integer()}].
add_line_to_ranges(Index, [{Index, _} | _] = Ranges) ->
    % The line already starts the first range (word repeated on one line).
    Ranges;
add_line_to_ranges(Index, [{Low, High} | Ranges]) when Index =:= Low - 1 ->
    % The line sits directly before the first range: extend it downwards.
    [{Index, High} | Ranges];
add_line_to_ranges(Index, Ranges) ->
    % Not adjacent: open a new single-line range.
    [{Index, Index} | Ranges].
% The functions below read a file into a list of lines.
% Example input files:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% Read the text file called Name and return its lines in file order.
% get_all_lines/2 collects the lines newest-first and closes the file, so
% its result is reversed here.
% Crashes with badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Device} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Device, [])).
% Auxiliary function for get_file_contents/1. Not exported.
%
% Reads File one line at a time until eof, pushing each line onto Partial,
% so the result holds the lines in reverse file order. Each line has its
% trailing newline removed. The file is closed before returning.
%
% If io:get_line/2 reports {error, Reason}, the file is closed and the
% call fails with error({read_error, Reason}).
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            % Release the handle before failing.
            file:close(File),
            error({read_error, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

% Remove a single trailing newline, if there is one.
% The last line of a file may have no newline; it must not lose its final
% character.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Rest] -> lists:reverse(Rest);
        _ -> Line
    end.