:- module(pc_text_processing, [ only_whitespaces/1, is_whitespace/1, is_CR/1, is_LF/1, is_newline/1, up_to_eol/2, pos_on_eol/2, up_to_eow/2, pos_on_eow/2, nth_line/3, nth_word/3, line_number/3, find_line/4, take_rest/3, take_begin/3, next_line/3, split_string/3, substr/4, count_words/2, trim_begin/2, trim_end/2, trim/2, begin_with/2, end_with/2, substring_pos/3, word_pos/3, word_number/3, replace_all/4, replace_first/4, downcase_str/2, upcase_str/2, downcase_char/2, upcase_char/2, case_insensitive/2 ], [ assertions ]). :- use_module(library(lists)). :- doc(author, "Christian Giménez"). :- doc(copyright, " pc_text_processing.pl Copyright (C) 2012 Giménez, Christian N. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Miércoles 07 De Noviembre Del 2012 "). :- doc(title, "Processing Text"). :- doc(module, "Prolog still doesn't have a text processing library, well, here it is. This is a text processing library that gives you predicates for changing or retrieving strings from a large text. @section{Conventions} This library use units as words or characters depending on the predicate you use. So character 3 of \"hello world\" is the first \"l\" and the the word number 2 is \"world\". When we say \"the n-th position of the string\" we mean the n-th character of the string. So character 1 of \"hello world\" is the \"h\". Remember that the predicate @pred{nth/3} use the same convention. "). :- pred is_whitespace(C) # "true iff @var{C} is a whitespace. Whitespaces are space(ASCII:32), tab(ASCII:9), newline(ASCII:10), return(ASCII:13)". is_whitespace(32). is_whitespace(9). is_whitespace(10). is_whitespace(13). :- pred is_CR(C) # "true iff @var{C} is the Carriage Return(ASCII: 10).". is_CR(10). :- pred is_LF(C) # "true iff @var{C} is the LineFeed(ASCII: 13).". is_LF(13). :- pred is_newline(C) # "true iff @var{C} is the Carriage Return(ASCII: 10) or the LineFeed(ASCII: 13).". is_newline(10). is_newline(13). :- pred up_to_eol(Text, L) # "true iff @var{L} is the first line of the text up to the '\\n' character(end of line or EOL). ". up_to_eol([], []). up_to_eol([C|_], []) :- is_newline(C),!. up_to_eol([C|Text], [C|L]) :- up_to_eol(Text, L). :- pred pos_on_eol(+Text, Pos) # "true iff @var{Pos} is the position where the first EOL is in @var{Text}.". pos_on_eol(Text, Pos) :- up_to_eol(Text, Line), length(Line, Pos). :- pred up_to_eow(Text, W) # "true iff @var{W} is the first word of @var{Text}(take the letters, numbers, and symbols up to the first space character).". up_to_eow([], []). up_to_eow([C|_], []) :- is_whitespace(C),!. up_to_eow([C|Text], [C|L]) :- up_to_eow(Text, L). :- pred pos_on_eow(+Text, Pos) # "true iff @var{Pos} is the position of the first EOW in @var{Text}.". pos_on_eow(Text, Pos) :- up_to_eow(Text, Word), length(Word,Pos). :- pred nth_line(+Text, +N, L) # "true iff @var{L} is the @var{N}th line of @var{Text}.". nth_line([], _, ""). nth_line(_, 0, ""). nth_line(Text, 1, L) :- %% Base case: the first line up_to_eol(Text,L),!. nth_line([Char|Rest], N, L) :- is_newline(Char), N2 is N - 1, nth_line(Rest, N2, L). nth_line([_Char|Rest], N, L) :- N > 1, nth_line(Rest,N,L). :- pred nth_word(+Text, +N, Word) # "true iff @var{Word} is the @var{N}th word of the @var{Text}. Another interesting predicate is @pred{word_number/3}. Given the @var{Word}, return the word number.". nth_word(_Text, 0, "") :- !. nth_word("", _N, "") :- !. nth_word(Text, 1, Word) :- up_to_eow(Text, Word). nth_word(Text, N, Word) :- pos_on_eow(Text, Pos), Pos1 is Pos + 1, take_rest(Text, Pos1, Rest1), trim_begin(Rest1,Rest), N1 is N - 1, nth_word(Rest, N1, Word). :- pred line_number(+Text, +L ,N) # "true iff line string @var{L} is at line number @var{N} in the @var{Text}.". line_number(Text, L, N) :- find_line(Text,L,1,N). :- pred find_line(+Text, +L, +Start, N) # "true iff line string @var{L} is at line number @var{N} searching from line number @var{StartSearch} in the @var{Text}.". find_line(Text,L,StartSearch,StartSearch) :- % Base case: the line is at the starting line. up_to_eol(Text,L),!. find_line([Char|Rest],L,StartSearch,N) :- is_newline(Char), StartSearch2 is StartSearch + 1, find_line(Rest,L,StartSearch2,N). find_line([C|Rest],L,StartSearch,N) :- % Keep going up to the newline character(ignore the actual line). \+ is_newline(C), find_line(Rest,L,StartSearch,N). :- pred take_rest(+Text, +Pos, Rest) # "true iff @var{Rest} is the string that begins from position @var{Pos} up to the end.". take_rest("", 1, "") :- !. take_rest(Text, 1 , Text) :- !. take_rest([_C|R], Pos, Rest) :- Pos2 is Pos - 1, take_rest(R, Pos2, Rest). :- pred take_begin(+Text, +Pos, Begin) # "true iff @var{Begin} is the first part of the @var{Text} up to @var{Pos} position. @var{Pos} can be understood as a Length of the begining of the @var{Text} you want. For example: @tt{take_begin(\"Hello word, how are you?\", 10, \"Hello word\").} is true.". take_begin(_Text, 0, "") :- !. take_begin([C|Rest], Pos, [C|Rest2]) :- Pos1 is Pos - 1, take_begin(Rest, Pos1, Rest2). :- pred next_line(+Text, +ActualPos, LinePos) # "true iff @var{LinePos} is the position of the next line starting from the @var{ActualPos} position.". next_line(Text, ActualPos, LinePos) :- take_rest(Text,ActualPos, Rest), %% We take out the begining up to ActualPos next_line2(Rest, ActualPos, LinePos). :- pred next_line2(+Text, +ActualPos, LinePos) # "true iff @var{LinePos} is the number @var{ActualPos} + Amount_of_characters_up_to_EOL. This start from the begin of the @var{Text} counting up to the EOL.". next_line2("", _ActualPos, 0) :- !. % Base case: no text! next_line2([C|_Text], ActualPos, ActualPos1) :- % Base case: this is the next line. is_newline(C),!, ActualPos1 is ActualPos + 1. next_line2([C|Rest], ActualPos, LinePos) :- \+ is_newline(C), NextPos is ActualPos + 1, next_line2(Rest, NextPos, LinePos). :- pred split_string(+Text, +Separator, LstResult) # "Split the @var{Text} into varios strings according to the @var{Separator} String. For example: @tt{split_string(\"Hello word, how are you?\", \" \", [\"Hello\", \"word,\", \"how\", \"are\", \"you?\"]} is true". split_string(Text, "", Text) :- !. split_string([], _Separator, []) :- !. split_string(Text, Separator, [Text]) :- \+ list_concat([_, Separator, _], Text),!. split_string(Text, Separator, [Elt|LstResult]) :- list_concat([Elt, Separator, Rest], Text),!, split_string(Rest, Separator, LstResult). :- pred substr(+Text, BeginPos, Length, Substring) # "true iff @var{Substring} is the string that is inside @var{Text} from the @var{BeginPos} up to @var{BeginPos} + @var{Length}. This is useful for extracting string from a text.". substr(Text, BeginPos, Length, Substring) :- take_rest(Text, BeginPos, S1), take_begin(S1, Length, Substring). :- pred count_words(+Text, Amount) # "true iff @var{Amount} is the amount of words that @var{Text} has.". count_words("", 0) :- !. count_words(Text, Amount) :- is_whitespace(C), split_string(Text, [C], L), length(L, Amount),!. :- pred trim_begin(+Text, Result) # "true iff @var{Result} is @var{Text} with no whitespaces at the begining.". trim_begin("", "") :- !. trim_begin([C|Rest], Rest1) :- is_whitespace(C), trim_begin(Rest, Rest1). trim_begin([C|Text], [C|Text]) :- \+ is_whitespace(C),!. :- pred trim_end(+Text, Result) # "true iff @var{Result} is @var{Text} with no whitespaces at the ending.". trim_end("", "") :- !. trim_end(Rest, "") :- only_whitespaces(Rest),!. trim_end([C|Rest], [C|Rest_Trimmed]) :- trim_end(Rest, Rest_Trimmed). :- pred trim(+Text, TrimmedText) # "true iff @var{TrimmedText} has no spaces at the begining and at the end. Is the same as: @tt{trim_end(Text, EndTrimmed), trim_begin(EndTrimmed, TrimmedText).} ". trim(Text, TrimmedText) :- trim_end(Text, EndTrimmed), trim_begin(EndTrimmed, TrimmedText),!. :- pred only_whitespaces(+Text) # "true iff @var{Text} is a string conformed only by whitespaces.". only_whitespaces("") :- !. only_whitespaces([C|Rest]) :- is_whitespace(C), only_whitespaces(Rest). :- pred begin_with(+Text, Begin) # "true iff @var{Text} begin with the string @var{Begin}.". begin_with(_Text, ""). begin_with([C|Text], [C|Rest]) :- begin_with(Text, Rest). :- pred end_with(+Text, Ending) # "true iff the ending of the @var{Text} is @var{Ending}.". end_with(Text, Ending) :- list_concat([_A, Ending, []], Text). :- pred substring_pos(+Text, +Substring, Pos) # "true iff the first ocurrence of @var{Substring} is at position @var{Pos} in the @var{Text}.". substring_pos(_Text, "", 1) :- !. % redcut. substring_pos(Text, Substring, 1) :- begin_with(Text, Substring),!. % redcut substring_pos([_C|Text], Substring, Pos) :- substring_pos(Text, Substring, Pos1), Pos is Pos1 + 1. :- pred word_pos(+Text, +Word, Pos) # "true iff the position of the @var{Word} is @var{Pos} in @var{Text}. This predicate counts character by character, but @pred{word_number/3} count word by word. This predicate ensures that what you tiped is a word in a text, not only a substring. For example: @tt{substring_pos(\"hello word, how are you?\", \"ell\", _).} will be true, but \"ell\" is not a word in the text.". word_pos("", _Word , 0) :- !. word_pos(Text, Word, 1) :- up_to_eow(Text, Word),!. word_pos(Text, Word, Pos) :- pos_on_eow(Text, Pos1), Pos2 is Pos1 + 2, take_rest(Text, Pos2, Rest), word_pos(Rest, Word, Pos3), Pos is Pos2 + Pos3 - 1. :- pred word_number(+Text, +Word, N) # "true iff the @var{Word} is the @var{N}th word of the @var{Text}. This predicate counts word by word, but @pred{word_pos/3} count character by character(position). Another interesting predicate is @pred{nth_word/3}, this is the inverse: given the number @var{N} return the nth word.". word_number("", _Word , 0) :- !. word_number(Text, Word, 1) :- up_to_eow(Text, Word),!. word_number(Text, Word, N) :- pos_on_eow(Text, Pos1), Pos2 is Pos1 + 2, take_rest(Text, Pos2, Rest), word_number(Rest, Word, N1), N is N1 + 1. :- pred replace_all(+Text, +Str, +Dest, Replaced) # "True iff @var{Replace} is the string with all substrings @var{Str} in @var{Text} replaced with string @var{Dest}.". replace_all("", _Str, _Dest, "") :- !. replace_all(Text, "", _Dest, Text) :- !. replace_all(Text, Str, Dest, Replaced) :- begin_with(Text, Str),!, length(Str, StrLen1), StrLen is StrLen1 + 1, take_rest(Text, StrLen, Rest), replace_all(Rest, Str, Dest, Replaced2), append(Dest, Replaced2, Replaced). replace_all([C|Rest], Str, Dest, [C|RestRep]) :- replace_all(Rest, Str, Dest, RestRep). :- pred replace_first(+Text, +Str, +Dest, Replaced) # "The same as @pred{replace_all/4} but replacing only one ocurrence.". replace_first("", _Str, _Dest, "") :- !. replace_first(Text, "", _Dest, Text) :- !. replace_first(Text, Str, Dest, Replaced) :- begin_with(Text, Str),!, length(Str, StrLen1), StrLen is StrLen1 + 1, take_rest(Text, StrLen, Rest), append(Dest, Rest, Replaced). replace_first([C|Rest], Str, Dest, [C|RestRep]) :- replace_first(Rest, Str, Dest, RestRep). :- pred case_insensitive(+String, String2) # "true iff @var{String} is the same string as @var{String2} but with characters downcased or upcased.". case_insensitive("", "") :- !. case_insensitive([C|Rest], [B|Rest2]) :- upcase_char(C,B), case_insensitive(Rest, Rest2). case_insensitive([C|Rest], [B|Rest2]) :- downcase_char(C,B), case_insensitive(Rest, Rest2). case_insensitive([C|Rest], [C|Rest2]) :- case_insensitive(Rest, Rest2). :- pred upcase_char(+Char, Upcase) # "true iff @var{Upcase} is the same character as @var{Char} but upcased.". upcase_char(Char, Upcase) :- Char >= 97, Char =< 122,!, Upcase is Char - 32. upcase_char(Char, Char) :- !. :- pred downcase_char(+Char, Downcase) # "true iff @var{Downcase} is the same character as @var{Char} but downcased.". downcase_char(Char, Downcase) :- Char >= 65, Char =< 90,!, Downcase is Char + 32. downcase_char(Char, Char) :- !. :- pred upcase_str(+String, UpCased) # "true iff @var{Upcased} is the same string as @var{String} but with all characters upcased.". upcase_str("", "") :- !. upcase_str([C|Rest], [D|Rest2]) :- upcase_char(C, D), upcase_str(Rest, Rest2). :- pred downcase_str(+String, Downcased) # "true iff @var{Downcased} is the same string as @var{String} but with all characters downcased.". downcase_str("", "") :- !. downcase_str([C|Rest], [D|Rest2]) :- downcase_char(C,D), downcase_str(Rest, Rest2).