You can convert text to unicode and use the unicode regex pattern.
Here are the three characters you mentioned:
In [213]: print(u'\N{COPYRIGHT SIGN} \N{TRADE MARK SIGN} \N{REGISTERED SIGN}')
© ™ ®
Here is a UTF-8 encoded byte string:
In [223]: content = u'\N{TRADE MARK SIGN}'.encode('utf-8')
Here we convert it to unicode:
In [224]: text = content.decode('utf-8')
This is a regular search for any of the three characters:
In [225]: re.search(u'(\N{COPYRIGHT SIGN}|\N{TRADE MARK SIGN}|\N{REGISTERED SIGN})', text)
Out[225]: <_sre.SRE_Match at 0x9a1ebe0>
There are web pages that catalog each Unicode character. But there are well over a hundred thousand assigned Unicode code points, so it is not practical to find a character by looking through them manually.
So, I wrote the program below to search for Unicode characters by name.
import sys
import unicodedata as ud
import re
import argparse
import functools
# Example invocations; passed to argparse as the epilog of the --help output.
# The backslashes are doubled so the text still reads '\d' without relying on
# an invalid string escape sequence.
__usage__ = '''\
unicode_lookup.py -u '\\d' # Shows all unicode symbols that regex match '\\d'
unicode_lookup.py number # Shows all unicode symbols whose name regex matches 'number'
'''
try:
    _code_point_to_text = unichr  # Python 2: chr() only produces byte strings
except NameError:
    _code_point_to_text = chr  # Python 3: chr() already returns text


def _emit(line):
    """Print one text line: UTF-8 bytes on Python 2, plain str on Python 3."""
    if sys.version_info[0] < 3:
        print(line.encode('utf-8'))
    else:
        print(line)


def lookup(name_pat=None, from_num=0, to_num=0x10ffff, unicode_pattern=None,
           category_pattern=None, ignore_unnamed=True,
           combining=False):
    """Print a table of the Unicode characters that pass every given filter.

    A header line is printed first, then one line per matching code point
    with its symbol, number, category, bidirectional class, East Asian
    width, combining class, mirrored flag and name.

    Args:
        name_pat: Compiled regex searched for in the character name, or
            None to accept any name.
        from_num: First code point to examine (inclusive). Values below 0
            are treated as 0.
        to_num: Last code point to examine (inclusive). Values above
            ``sys.maxunicode`` are clamped to it.
        unicode_pattern: Compiled regex that must match the character
            itself, or None to skip this filter.
        category_pattern: Compiled regex that must match the category
            string returned by ``unicodedata.category``, or None.
        ignore_unnamed: When true, characters for which
            ``unicodedata.name`` raises ValueError are skipped. When false
            they are listed with the name ``'?'`` and are not subject to
            ``name_pat``.
        combining: When true, only characters with a non-zero combining
            class are listed.
    """
    fmt = u"{symbol} {num} {cat} {bi} {w} {comb} {mir} '{name}'"
    _emit(fmt.format(
        symbol='Symbol', num='Num', name='NAME',
        cat='Category', bi='Bidirectional', w='Width',
        comb='Combining', mir='Mirrored'))

    # Stay inside the range the interpreter can represent; chr()/unichr()
    # raise ValueError outside of it.
    first = max(from_num, 0)
    last = min(to_num, sys.maxunicode)

    for num in range(first, last + 1):
        s = _code_point_to_text(num)
        if unicode_pattern and not unicode_pattern.match(s):
            continue
        category = ud.category(s)
        if category_pattern and not category_pattern.match(category):
            continue
        try:
            name = ud.name(s)
        except ValueError:
            # No name in the Unicode database for this code point.
            if ignore_unnamed:
                continue
            name = '?'
        else:
            if name_pat and not name_pat.search(name):
                continue
        combining_class = ud.combining(s)
        if combining and not combining_class:
            continue
        # A lone surrogate cannot be encoded as UTF-8 on Python 3, so show
        # the replacement character in the symbol column instead.
        symbol = u'\ufffd' if category == 'Cs' else s
        _emit(fmt.format(
            num=num, symbol=symbol, name=name,
            cat=category, bi=ud.bidirectional(s),
            w=ud.east_asian_width(s),
            comb=combining_class, mir=ud.mirrored(s)))
def parse_options(argv=None):
    """Parse the command-line options of the lookup script.

    Args:
        argv: Optional list of argument strings. With the default of None,
            argparse reads the arguments from ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with the attributes ``name_pat``,
        ``from_num``, ``to_num``, ``unicode_pattern``, ``category_pattern``,
        ``show_unnamed`` and ``combining``. The three pattern attributes are
        compiled regexes, or None when not given.
    """
    parser = argparse.ArgumentParser(
        epilog=__usage__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # The name pattern is optional so that the script can be driven by the
    # other filters alone, e.g. ``unicode_lookup.py -u '\d'``.
    parser.add_argument(
        'name_pat', nargs='?', default=None,
        type=functools.partial(re.compile, flags=re.IGNORECASE),
        help='regex searched for (case-insensitively) in the character name')
    parser.add_argument('-f', '--from_num', default=0, type=int,
                        help='first code point to examine')
    parser.add_argument('-t', '--to_num', default=0x10ffff, type=int,
                        help='last code point to examine (inclusive)')
    parser.add_argument(
        '-u', '--unicode_pattern',
        type=functools.partial(re.compile, flags=re.UNICODE),
        help='regex that must match the character itself')
    parser.add_argument('--category_pattern', type=re.compile,
                        help='regex that must match the character category')
    parser.add_argument('--show_unnamed', action='store_true',
                        help='also list characters that have no name')
    parser.add_argument('--combining', action='store_true',
                        help='only list characters with a non-zero '
                             'combining class')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: parse the command line and forward every option
    # to lookup() by keyword.
    args = parse_options()
    lookup_kwargs = dict(
        name_pat=args.name_pat,
        from_num=args.from_num,
        to_num=args.to_num,
        unicode_pattern=args.unicode_pattern,
        category_pattern=args.category_pattern,
        # The CLI flag is phrased positively; lookup() takes the inverse.
        ignore_unnamed=not args.show_unnamed,
        combining=args.combining,
    )
    lookup(**lookup_kwargs)
Running
% unicode_lookup.py "copyright|trade|registered"
gives
Symbol Num Category Bidirectional Width Combining Mirrored 'NAME'
© 169 So ON N 0 0 'COPYRIGHT SIGN'
® 174 So ON A 0 0 'REGISTERED SIGN'
℗ 8471 So ON N 0 0 'SOUND RECORDING COPYRIGHT'
™ 8482 So ON A 0 0 'TRADE MARK SIGN'