# -*- coding: utf-8 -*- # This file is part of Ascii Emoji. # # Copyright (C) 2021 Arthur Bols # # Ascii Emoji is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ascii Emoji is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ascii Emoji. If not, see <https://www.gnu.org/licenses/>. import re class AsciiEmoticonsData(dict): def get_regex(self): # When an emoticon is bordered by an alphanumeric character it # is NOT expanded. e.g., foo:) NO, foo :) YES, (BRB) NO, # (:)) YES, etc. We still allow multiple emoticons # side-by-side like :P:P:P keys = sorted(self.keys(), key=len, reverse=True) pre_pattern = '' post_pattern = '' emoticon_length = 0 emoticons_pattern = '' for emoticon in keys: # escape regexp metachars emoticon_escaped = re.escape(emoticon) emoticons_pattern += emoticon_escaped + '|' if emoticon_length != len(emoticon): # Build up expressions to match emoticons next to others pre_pattern = pre_pattern[:-1] + ')|(?<=' post_pattern = post_pattern[:-1] + ')|(?=' emoticon_length = len(emoticon) pre_pattern += emoticon_escaped + '|' post_pattern += emoticon_escaped + '|' # We match from our list of emoticons, but they must either have # whitespace, or another emoticon next to it to match successfully # [\w.] 
alphanumeric and dot (for not matching 8) in (2.8)) emoticons_pattern = r'(?:(?:P', '\U0001F61c'), ('X-P', '\U0001F61c'), (':(', '\U0001F61e'), (':-(', '\U0001F61e'), (':-[', '\U0001F61e'), (':[', '\U0001F61e'), ('=(', '\U0001F61e'), ('>:[', '\U0001F61e'), (':-O', '\U0001F62e'), (':O', '\U0001F62e'), ('>:O', '\U0001F62e'), ('O_O', '\U0001F62e'), (':)', '\U0001F606'), ('>:-)', '\U0001F606'), ('>;)', '\U0001F606'), ('>=)', '\U0001F606'), ('x-D', '\U0001F606'), ('X-D', '\U0001F606'), ('xD', '\U0001F606'), ('XD', '\U0001F606'), ('0:)', '\U0001F607'), ('0:-)', '\U0001F607'), ('0:-3', '\U0001F607'), ('0:3', '\U0001F607'), ('0;-)', '\U0001F607'), ('0;^)', '\U0001F607'), ('O:)', '\U0001F607'), ('O:-)', '\U0001F607'), ('O:-3', '\U0001F607'), ('O:3', '\U0001F607'), ('O;-)', '\U0001F607'), ('O=)', '\U0001F607'), ('*)', '\U0001F609'), ('*-)', '\U0001F609'), (';)', '\U0001F609'), (';-)', '\U0001F609'), (';-]', '\U0001F609'), (';]', '\U0001F609'), (';^)', '\U0001F609'), (';D', '\U0001F609'), ('-_-', '\U0001F611'), ('-__-', '\U0001F611'), ('-___-', '\U0001F611'), ("':(", '\U0001F613'), ("':-(", '\U0001F613'), ("'=(", '\U0001F613'), (':-.', '\U0001F615'), (':-/', '\U0001F615'), (':/', '\U0001F615'), (':\\', '\U0001F615'), (':L', '\U0001F615'), ('=/', '\U0001F615'), ('=\\', '\U0001F615'), ('=L', '\U0001F615'), ('>:/', '\U0001F615'), ('>:\\', '\U0001F615'), (':*', '\U0001F618'), (':-*', '\U0001F618'), (':^*', '\U0001F618'), ('=*', '\U0001F618'), (':@', '\U0001F620'), ('>:(', '\U0001F620'), ('>:-(', '\U0001F620'), (":'(", '\U0001F622'), (":'-(", '\U0001F622'), (';(', '\U0001F622'), (';-(', '\U0001F622'), ('>.<', '\U0001F623'), ('D:', '\U0001F628'), (':$', '\U0001F633'), ('=$', '\U0001F633'), ('#)', '\U0001F635'), ('#-)', '\U0001F635'), ('%)', '\U0001F635'), ('%-)', '\U0001F635'), ('X)', '\U0001F635'), ('X-)', '\U0001F635'), (':#', '\U0001F636'), (':-#', '\U0001F636'), (':-X', '\U0001F636'), (':X', '\U0001F636'), ('=#', '\U0001F636'), ('=X', '\U0001F636'), (':)', '\U0001F642'), 
(':-)', '\U0001F642'), (':]', '\U0001F642'), ('=)', '\U0001F642'), ('=]', '\U0001F642'), ('*\\0/*', '\U0001F646'), ('*\\O/*', '\U0001F646'), ('\\0/', '\U0001F646'), ('\\O/', '\U0001F646'), ('<3', '\U00002764\U0000FE0F'), ])