文本对比¶
目录¶
1. python的difflib¶
Python的difflib
模块提供了用于比较序列的函数,特别适用于比较文本行。它可以生成多种格式的差异报告。
1.1 主要功能¶
- SequenceMatcher: 用于比较任意类型的序列对
- Differ: 用于比较文本行序列
- HtmlDiff: 生成HTML格式的差异报告
- unified_diff: 生成统一格式的差异
- context_diff: 生成上下文格式的差异
1.2 基本使用示例¶
简单文本对比¶
import difflib
text1 = """line 1
line 2
line 3
line 4"""
text2 = """line 1
line 2 modified
line 3
line 5"""
# 转换为行列表
lines1 = text1.splitlines(keepends=True)
lines2 = text2.splitlines(keepends=True)
# 生成统一格式差异
diff = difflib.unified_diff(lines1, lines2,
fromfile='original.txt',
tofile='modified.txt',
lineterm='')
for line in diff:
print(line)
字符串相似度比较¶
import difflib
def get_similarity(str1, str2):
"""计算两个字符串的相似度"""
return difflib.SequenceMatcher(None, str1, str2).ratio()
# 示例
text1 = "Hello World"
text2 = "Hello Python"
similarity = get_similarity(text1, text2)
print(f"相似度: {similarity:.2%}") # 输出: 相似度: 54.55%
生成HTML差异报告¶
import difflib
def generate_html_diff(text1, text2, filename1="文件1", filename2="文件2"):
"""生成HTML格式的差异报告"""
lines1 = text1.splitlines()
lines2 = text2.splitlines()
differ = difflib.HtmlDiff()
html_diff = differ.make_file(lines1, lines2, filename1, filename2)
return html_diff
# 示例使用
old_text = """第一行内容
第二行内容
第三行内容"""
new_text = """第一行内容
第二行修改后的内容
第三行内容
新增的第四行"""
html_output = generate_html_diff(old_text, new_text)
# 可以保存到文件中查看
with open('diff_report.html', 'w', encoding='utf-8') as f:
f.write(html_output)
1.3 高级功能¶
文件内容对比¶
import difflib
def compare_files(file1_path, file2_path):
"""比较两个文件的内容"""
with open(file1_path, 'r', encoding='utf-8') as f1:
lines1 = f1.readlines()
with open(file2_path, 'r', encoding='utf-8') as f2:
lines2 = f2.readlines()
# 生成详细的差异报告
differ = difflib.Differ()
diff = list(differ.compare(lines1, lines2))
return diff
# 使用示例
# diff_result = compare_files('file1.txt', 'file2.txt')
# for line in diff_result:
# print(line.rstrip())
查找最相似的匹配¶
import difflib
def find_best_match(query, candidates, cutoff=0.6):
"""在候选列表中找到最相似的匹配项"""
matches = difflib.get_close_matches(query, candidates,
n=3, cutoff=cutoff)
return matches
# 示例
words = ['apple', 'application', 'apply', 'appreciate', 'approach']
query = 'app'
matches = find_best_match(query, words)
print(f"与 '{query}' 最相似的词: {matches}")
2. 其他文本对比工具¶
2.1 Linux命令行工具¶
diff命令¶
# 基本用法
diff file1.txt file2.txt
# 生成统一格式输出
diff -u file1.txt file2.txt
# 忽略空白字符差异
diff -w file1.txt file2.txt
# 递归比较目录
diff -r dir1/ dir2/
vimdiff¶
2.2 在线文本对比工具¶
- Diffchecker: https://www.diffchecker.com/
- Text Compare: http://text-compare.com/
- DiffNow: https://www.diffnow.com/
3. 实际应用场景¶
3.1 代码版本对比¶
import difflib
import os
def compare_code_versions(old_version, new_version):
"""比较代码的两个版本"""
if os.path.isfile(old_version) and os.path.isfile(new_version):
with open(old_version, 'r', encoding='utf-8') as f1:
old_lines = f1.readlines()
with open(new_version, 'r', encoding='utf-8') as f2:
new_lines = f2.readlines()
diff = difflib.unified_diff(
old_lines, new_lines,
fromfile=f'a/{os.path.basename(old_version)}',
tofile=f'b/{os.path.basename(new_version)}',
lineterm=''
)
return '\n'.join(diff)
else:
return "文件不存在"
3.2 配置文件对比¶
import difflib
import json
def compare_json_configs(config1, config2):
"""比较两个JSON配置文件"""
def format_json(data):
return json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)
json1_str = format_json(config1)
json2_str = format_json(config2)
lines1 = json1_str.splitlines(keepends=True)
lines2 = json2_str.splitlines(keepends=True)
diff = difflib.unified_diff(lines1, lines2,
fromfile='config1.json',
tofile='config2.json',
lineterm='')
return list(diff)
# 示例
config_old = {"name": "app", "version": "1.0", "debug": True}
config_new = {"name": "app", "version": "1.1", "debug": False, "port": 8080}
diff_result = compare_json_configs(config_old, config_new)
for line in diff_result:
print(line.rstrip())
4. 最佳实践¶
4.1 性能优化¶
- 对于大文件,考虑分块处理
- 使用合适的cutoff值来过滤低相似度结果
- 预处理文本(去除空白、标准化格式)
4.2 输出格式选择¶
- 统一格式(unified): 适合代码审查
- 上下文格式(context): 提供更多上下文信息
- HTML格式: 适合可视化展示
- 自定义格式: 根据具体需求定制
4.3 常见问题处理¶
import difflib
def robust_text_compare(text1, text2, ignore_case=False, ignore_whitespace=False):
"""健壮的文本比较函数"""
if ignore_case:
text1 = text1.lower()
text2 = text2.lower()
if ignore_whitespace:
text1 = ' '.join(text1.split())
text2 = ' '.join(text2.split())
similarity = difflib.SequenceMatcher(None, text1, text2).ratio()
return similarity
# 示例
text1 = "Hello World "
text2 = "hello world"
print(f"严格比较: {robust_text_compare(text1, text2):.2%}")
print(f"忽略大小写和空白: {robust_text_compare(text1, text2, True, True):.2%}")