# -*- coding: utf-8 -*-
"""Highlight the word n-grams shared by two UTF-8 text files.

Usage: script ORIGINAL SIMILAR [NGRAM]

Both files are tokenised into words, every run of NGRAM consecutive words
that occurs in both documents (case-insensitively) is flagged, and the two
word lists are written to standard output.  For the second document the
length of the longest flagged run is also reported on standard error as
``<longest>;<path>``.

NOTE(review): the highlight markers below are empty strings and the
paragraph separators are bare newlines in this file.  Presumably some
markup was lost at some point -- confirm against the intended output.
"""
import re
import sys

# Default number of consecutive words that must coincide to count as a match.
ngram = 4

# Text emitted in front of a flagged / non-flagged word (both empty here).
MATCHED_PREFIX = ''
UNMATCHED_PREFIX = ''


def file2array(file):
    """Read *file* and return its content as a list of words.

    *file* is any object with a ``read()`` method; bytes are decoded as
    UTF-8, already-decoded text is used as is.  Punctuation and ``|`` are
    turned into spaces, quotation marks and colons become a standalone
    ``"`` token, and an ``L`` directly followed by a digit is split from
    the number.
    """
    txt = file.read()
    if isinstance(txt, bytes):
        txt = txt.decode('UTF-8')
    # Characters replaced by a space: ? + U+2011 . ! , ; U+2013 ( ) ' U+2019 -
    txt = re.sub(u"[?+\u2011.!,;\u2013()'\u2019-]", " ", txt)
    txt = txt.replace('|', ' ')
    txt = re.sub(r'\n+', '\n', txt)
    txt = re.sub(u' *([«"»:;]) *', r' " ', txt)
    txt = re.sub(r'L(\d)', r'L \1', txt)
    return [mot for mot in re.split(r"\s+", txt) if mot]


def find_matches(mots1, mots2, size):
    """Flag the words belonging to an n-gram present in both word lists.

    Two n-grams match when their *size* words are equal, ignoring case.
    Words are compared as plain text (never interpreted as a regular
    expression) and every window is considered, including the last one.

    Returns ``(match1, match2)``: lists of 0/1 flags parallel to *mots1*
    and *mots2*.  Raises ``ValueError`` when *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError("ngram size must be a positive integer")

    def windows(keys):
        return [tuple(keys[start:start + size])
                for start in range(len(keys) - size + 1)]

    keyed = []
    for mots in (mots1, mots2):
        keys = [mot.lower() for mot in mots]
        keyed.append((windows(keys), [0] * len(mots)))

    shared = set(keyed[0][0]) & set(keyed[1][0])
    for grams, flags in keyed:
        for start, gram in enumerate(grams):
            if gram in shared:
                flags[start:start + size] = [1] * size
    return keyed[0][1], keyed[1][1]


def render_highlight(mots, match):
    """Build the output text for *mots* and measure the longest flagged run.

    *match* holds one truthy/falsy flag per word.  Returns
    ``(text, longest)`` where *longest* is the length of the longest run
    of consecutive flagged words, including a run that ends on the last
    word.
    """
    parts = []
    longest = 0
    run = 0
    for mot, hit in zip(mots, match):
        if hit:
            parts.append(MATCHED_PREFIX)
            run += 1
            longest = max(longest, run)
        else:
            parts.append(UNMATCHED_PREFIX)
            run = 0
        parts.append(mot)
        parts.append(' ')
    body = ''.join(parts).replace('\n', '\n\n')
    return '\n\n' + body + '\n\n', longest


def _write_line(text):
    """Write *text* plus a newline to stdout as UTF-8 bytes (Python 2 and 3)."""
    data = text if isinstance(text, bytes) else text.encode('UTF-8')
    # Flush the text layer first so direct byte writes keep their order.
    sys.stdout.flush()
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    stream.write(data + b'\n')


def printHighlight(mots, match, withnbmots=""):
    """Print the rendered word list for *mots*.

    When *withnbmots* is non-empty, ``<longest run>;<withnbmots>`` is also
    written to standard error.
    """
    html, longest = render_highlight(mots, match)
    _write_line(html)
    if withnbmots:
        sys.stderr.write(str(longest) + ";" + withnbmots + "\n")


def main(argv):
    """Compare the two files named in *argv* and print both highlighted."""
    if len(argv) < 3:
        sys.stderr.write("usage: %s ORIGINAL SIMILAR [NGRAM]\n" % argv[0])
        return 1
    # Binary mode: file2array does the UTF-8 decoding itself.
    with open(argv[1], 'rb') as f:
        mots1 = file2array(f)
    with open(argv[2], 'rb') as f:
        mots2 = file2array(f)
    size = int(argv[3]) if len(argv) > 3 else ngram

    match1, match2 = find_matches(mots1, mots2, size)

    _write_line("\n\nDocument d'origine\n\n")
    printHighlight(mots1, match1)
    _write_line("\n\nDocument similaire\n\n")
    printHighlight(mots2, match2, argv[2])
    _write_line("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))