:- module(pc_text_processing,
	[ only_whitespaces/1,
	  is_whitespace/1,
	  is_CR/1, 
	  is_LF/1,
	  is_newline/1,
	  up_to_eol/2,
	  pos_on_eol/2,	 
	  up_to_eow/2,
	  pos_on_eow/2,
	  nth_line/3,
	  nth_word/3,
	  line_number/3,
	  find_line/4,
	  take_rest/3,
	  take_begin/3,
	  next_line/3,
	  split_string/3,
	  substr/4,
	  count_words/2,
	  trim_begin/2,
	  trim_end/2,
	  trim/2,
	  begin_with/2,
	  end_with/2,
	  substring_pos/3,
	  word_pos/3,
	  word_number/3,
	  replace_all/4,
	  replace_first/4,
	  downcase_str/2,
	  upcase_str/2,
	  downcase_char/2,
	  upcase_char/2,
	  case_insensitive/2
	],	
	[ assertions
	]).

:- use_module(library(lists)).

:- doc(author, "Christian Giménez").
:- doc(copyright, "
    pc_text_processing.pl
    Copyright (C) 2012  Giménez, Christian N.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Miércoles 07 De Noviembre Del 2012    
").

:- doc(title, "Processing Text").

:- doc(module, "Prolog still doesn't have a text processing library, well, here it is.

This is a text processing library that gives you predicates for changing or retrieving strings from a large text.

@section{Conventions}

This library use units as words or characters depending on the predicate you use. So character 3 of \"hello world\" is the first \"l\" and the the word number 2 is \"world\".

When we say \"the n-th position of the string\" we mean the n-th character of the string. So character 1 of \"hello world\" is the \"h\". Remember that the predicate @pred{nth/3} use the same convention.
").


:- pred is_whitespace(C) # "true iff @var{C} is a whitespace. Whitespaces are space(ASCII:32), tab(ASCII:9), newline(ASCII:10), return(ASCII:13)".

is_whitespace(32).
is_whitespace(9).
is_whitespace(10).
is_whitespace(13).

:- pred is_CR(C) # "true iff @var{C} is the Carriage Return(ASCII: 10).".
is_CR(10).
:- pred is_LF(C) # "true iff @var{C} is the LineFeed(ASCII: 13).".
is_LF(13).

:- pred is_newline(C) # "true iff @var{C} is the Carriage Return(ASCII: 10) or the LineFeed(ASCII: 13).".
is_newline(10).
is_newline(13).

:- pred up_to_eol(Text, L) # "true iff @var{L} is the first line of the text up to the '\\n' character(end of line or EOL). ".
up_to_eol([], []).

up_to_eol([C|_], []) :- 
	is_newline(C),!.

up_to_eol([C|Text], [C|L]) :- 
	up_to_eol(Text, L).

:- pred pos_on_eol(+Text, Pos) # "true iff @var{Pos} is the position where the first EOL is in @var{Text}.".
pos_on_eol(Text, Pos) :-
	up_to_eol(Text, Line), 
	length(Line, Pos).
	
:- pred up_to_eow(Text, W) # "true iff @var{W} is the first word of @var{Text}(take the letters, numbers, and symbols up to the first space character).".

up_to_eow([], []).

up_to_eow([C|_], []) :- 
	is_whitespace(C),!.

up_to_eow([C|Text], [C|L]) :- 
	up_to_eow(Text, L).

:- pred pos_on_eow(+Text, Pos) # "true iff @var{Pos} is the position of the first EOW in @var{Text}.".
pos_on_eow(Text, Pos) :-
	up_to_eow(Text, Word),
	length(Word,Pos).

:- pred nth_line(+Text, +N, L) # "true iff @var{L} is the @var{N}th line of @var{Text}.".
nth_line([], _, "").

nth_line(_, 0, "").

nth_line(Text, 1, L) :- %% Base case: the first line
	up_to_eol(Text,L),!.

nth_line([Char|Rest], N, L) :-
	is_newline(Char),
	N2 is N - 1, 
	nth_line(Rest, N2, L).

nth_line([_Char|Rest], N, L) :-	
	N > 1,
	nth_line(Rest,N,L).

:- pred nth_word(+Text, +N, Word) # "true iff @var{Word} is the @var{N}th word of the @var{Text}.

Another interesting predicate is @pred{word_number/3}. Given the @var{Word}, return the word number.".

nth_word(_Text, 0, "") :- !.

nth_word("", _N, "") :- !.

nth_word(Text, 1, Word) :- 
	up_to_eow(Text, Word).

nth_word(Text, N, Word) :-
	pos_on_eow(Text, Pos),
	Pos1 is Pos + 1,
	take_rest(Text, Pos1, Rest1),
	trim_begin(Rest1,Rest),
	N1 is N - 1,
	nth_word(Rest, N1, Word).

:- pred line_number(+Text, +L ,N) # "true iff line string @var{L} is at line number @var{N} in the @var{Text}.".
line_number(Text, L, N) :-
	find_line(Text,L,1,N).

:- pred find_line(+Text, +L, +Start, N) # "true iff line string @var{L} is at line number @var{N} searching from line number @var{StartSearch} in the @var{Text}.".

find_line(Text,L,StartSearch,StartSearch) :- % Base case: the line is at the starting line.
	up_to_eol(Text,L),!.

find_line([Char|Rest],L,StartSearch,N) :-
	is_newline(Char),
	StartSearch2 is StartSearch + 1,
	find_line(Rest,L,StartSearch2,N).

find_line([C|Rest],L,StartSearch,N) :- % Keep going up to the newline character(ignore the actual line). 
	\+ is_newline(C),
	find_line(Rest,L,StartSearch,N).


:- pred take_rest(+Text, +Pos, Rest) # "true iff @var{Rest} is the string that begins from position @var{Pos} up to the end.".

take_rest("", 1, "") :- !.

take_rest(Text, 1 , Text) :- !.

take_rest([_C|R], Pos,  Rest) :-
	Pos2 is Pos - 1,
	take_rest(R, Pos2, Rest).

:- pred take_begin(+Text, +Pos, Begin) # "true iff @var{Begin} is the first part of the @var{Text} up to @var{Pos} position. @var{Pos} can be understood as a Length of the begining of the @var{Text} you want.

For example:

@tt{take_begin(\"Hello word, how are you?\", 10, \"Hello word\").} is true.".

take_begin(_Text, 0, "") :- !.

take_begin([C|Rest], Pos, [C|Rest2]) :-
	Pos1 is Pos - 1,
	take_begin(Rest, Pos1, Rest2).


:- pred next_line(+Text, +ActualPos, LinePos) # "true iff @var{LinePos} is the position of the next line starting from the @var{ActualPos} position.".

next_line(Text, ActualPos, LinePos) :-
	take_rest(Text,ActualPos, Rest), %% We take out the begining up to ActualPos
	next_line2(Rest, ActualPos, LinePos).

:- pred next_line2(+Text, +ActualPos, LinePos) # "true iff @var{LinePos} is the number @var{ActualPos} + Amount_of_characters_up_to_EOL. This start from the begin of the @var{Text} counting up to the EOL.".

next_line2("", _ActualPos, 0) :- !. % Base case: no text!

next_line2([C|_Text], ActualPos, ActualPos1) :- % Base case: this is the next line.	
	is_newline(C),!,
	ActualPos1 is ActualPos + 1.
	
next_line2([C|Rest], ActualPos, LinePos) :-
	\+ is_newline(C),
	NextPos is ActualPos + 1,
	next_line2(Rest, NextPos, LinePos).

:- pred split_string(+Text, +Separator, LstResult) # "Split the @var{Text} into varios strings according to the @var{Separator} String.
For example:

@tt{split_string(\"Hello word, how are you?\", \" \", [\"Hello\", \"word,\", \"how\", \"are\", \"you?\"]} is true".

split_string(Text, "", Text) :- !.

split_string([], _Separator, []) :- !.

split_string(Text, Separator, [Text]) :-
	\+ list_concat([_, Separator, _], Text),!.

split_string(Text, Separator, [Elt|LstResult]) :-
	list_concat([Elt, Separator, Rest], Text),!,
	split_string(Rest, Separator, LstResult).


:- pred substr(+Text, BeginPos, Length, Substring) # "true iff @var{Substring} is the string that is inside @var{Text} from the @var{BeginPos} up to @var{BeginPos} + @var{Length}. This is useful for extracting string from a text.".

substr(Text, BeginPos, Length, Substring) :-
	take_rest(Text, BeginPos, S1), 
	take_begin(S1, Length, Substring).

:- pred count_words(+Text, Amount) # "true iff @var{Amount} is the amount of words that @var{Text} has.".

count_words("", 0) :- !.

count_words(Text, Amount) :-
	is_whitespace(C),
	split_string(Text, [C], L),
	length(L, Amount),!.
	
	
:- pred trim_begin(+Text, Result) # "true iff @var{Result} is @var{Text} with no whitespaces at the begining.".

trim_begin("", "") :- !.
trim_begin([C|Rest], Rest1) :-
	is_whitespace(C),
	trim_begin(Rest, Rest1).

trim_begin([C|Text], [C|Text]) :-
	\+ is_whitespace(C),!.

:- pred trim_end(+Text, Result) # "true iff @var{Result} is @var{Text} with no whitespaces at the ending.".
	
trim_end("", "") :- !.

trim_end(Rest, "") :-
	only_whitespaces(Rest),!.

trim_end([C|Rest], [C|Rest_Trimmed]) :-
	trim_end(Rest, Rest_Trimmed).


:- pred trim(+Text, TrimmedText) # "true iff @var{TrimmedText} has no spaces at the begining and at the end. 

Is the same as:

	@tt{trim_end(Text, EndTrimmed), trim_begin(EndTrimmed, TrimmedText).}
".

trim(Text, TrimmedText) :-
	trim_end(Text, EndTrimmed), 
	trim_begin(EndTrimmed, TrimmedText),!.


:- pred only_whitespaces(+Text) # "true iff @var{Text}  is a string conformed only by whitespaces.".

only_whitespaces("") :- !.

only_whitespaces([C|Rest]) :-
	is_whitespace(C),
	only_whitespaces(Rest).
	

:- pred begin_with(+Text, Begin) # "true iff @var{Text} begin with the string @var{Begin}.".

begin_with(_Text, "").

begin_with([C|Text], [C|Rest]) :-
	begin_with(Text, Rest).

:- pred end_with(+Text, Ending) # "true iff the ending of the @var{Text} is @var{Ending}.".
end_with(Text, Ending) :-
	list_concat([_A, Ending, []], Text).

:- pred substring_pos(+Text, +Substring, Pos) # "true iff the first ocurrence of @var{Substring} is at position @var{Pos} in the @var{Text}.".

substring_pos(_Text, "", 1) :- !. % redcut.

substring_pos(Text, Substring, 1) :-
	begin_with(Text, Substring),!. % redcut

substring_pos([_C|Text], Substring, Pos) :- 
	substring_pos(Text, Substring, Pos1),
	Pos is Pos1 + 1.

:- pred word_pos(+Text, +Word, Pos) # "true iff the position of the @var{Word} is @var{Pos} in @var{Text}. This predicate counts character by character, but @pred{word_number/3} count word by word.

This predicate ensures that what you tiped is a word in a text, not only a substring.
For example:

@tt{substring_pos(\"hello word, how are you?\", \"ell\", _).} will be true, but \"ell\" is not a word in the text.".

word_pos("", _Word , 0) :- !.

word_pos(Text, Word, 1) :- 
	up_to_eow(Text, Word),!.

word_pos(Text, Word, Pos) :-
	pos_on_eow(Text, Pos1),
	Pos2 is Pos1 + 2,
	take_rest(Text, Pos2, Rest),
	word_pos(Rest, Word, Pos3),
	Pos is Pos2 + Pos3 - 1.

:- pred word_number(+Text, +Word, N) # "true iff the @var{Word} is the @var{N}th word of the @var{Text}. This predicate counts word by word, but @pred{word_pos/3} count character by character(position).

Another interesting predicate is @pred{nth_word/3}, this is the inverse: given the number @var{N} return the nth word.".

word_number("", _Word , 0) :- !.

word_number(Text, Word, 1) :- 
	up_to_eow(Text, Word),!.

word_number(Text, Word, N) :-
	pos_on_eow(Text, Pos1),
	Pos2 is Pos1 + 2,
	take_rest(Text, Pos2, Rest),
	word_number(Rest, Word, N1),
	N is N1 + 1.

:- pred replace_all(+Text, +Str, +Dest, Replaced) # "True iff @var{Replace} is the string with all substrings @var{Str} in @var{Text} replaced with string @var{Dest}.".

replace_all("", _Str, _Dest, "") :- !.

replace_all(Text, "", _Dest, Text) :- !.

replace_all(Text, Str, Dest, Replaced) :-
	begin_with(Text, Str),!,
	length(Str, StrLen1),
	StrLen is StrLen1 + 1,
	take_rest(Text, StrLen, Rest),
	replace_all(Rest, Str, Dest, Replaced2),
	append(Dest, Replaced2, Replaced).

replace_all([C|Rest], Str, Dest, [C|RestRep]) :-
	replace_all(Rest, Str, Dest, RestRep).


:- pred replace_first(+Text, +Str, +Dest, Replaced) # "The same as @pred{replace_all/4} but replacing only one ocurrence.".

replace_first("", _Str, _Dest, "") :- !.

replace_first(Text, "", _Dest, Text) :- !.

replace_first(Text, Str, Dest, Replaced) :-
	begin_with(Text, Str),!,
	length(Str, StrLen1),
	StrLen is StrLen1 + 1,
	take_rest(Text, StrLen, Rest),
	append(Dest, Rest, Replaced).

replace_first([C|Rest], Str, Dest, [C|RestRep]) :-
	replace_first(Rest, Str, Dest, RestRep).

:- pred case_insensitive(+String, String2) # "true iff @var{String} is the same string as @var{String2} but with characters downcased or upcased.".

case_insensitive("", "") :- !.

case_insensitive([C|Rest], [B|Rest2]) :-
	upcase_char(C,B),
	case_insensitive(Rest, Rest2).

case_insensitive([C|Rest], [B|Rest2]) :-
	downcase_char(C,B), 
	case_insensitive(Rest, Rest2).

case_insensitive([C|Rest], [C|Rest2]) :-
	case_insensitive(Rest, Rest2).

:- pred upcase_char(+Char, Upcase) # "true iff @var{Upcase} is the same character as @var{Char} but upcased.".

upcase_char(Char, Upcase) :-
	Char >= 97,
	Char =< 122,!,
	Upcase is Char - 32.

upcase_char(Char, Char) :- !.

:- pred downcase_char(+Char, Downcase) # "true iff @var{Downcase} is the same character as @var{Char} but downcased.".

downcase_char(Char, Downcase) :-
	Char >= 65, 
	Char =< 90,!,
	Downcase is Char + 32.

downcase_char(Char, Char) :- !.

:- pred upcase_str(+String, UpCased) # "true iff @var{Upcased} is the same string as @var{String} but with all characters upcased.".

upcase_str("", "") :- !.

upcase_str([C|Rest], [D|Rest2]) :-
	upcase_char(C, D),
	upcase_str(Rest, Rest2).

:- pred downcase_str(+String, Downcased) # "true iff @var{Downcased} is the same string as @var{String} but with all characters downcased.".

downcase_str("", "") :- !.

downcase_str([C|Rest], [D|Rest2]) :-
	downcase_char(C,D),
	downcase_str(Rest, Rest2).