跳转至

文本对比

目录

1. Python的difflib

Python的difflib模块提供了用于比较序列的类和函数,特别适用于比较文本行,并且可以生成多种格式的差异报告。

1.1 主要功能

  • SequenceMatcher: 用于比较任意类型的序列对(序列中的元素必须是可哈希的)
  • Differ: 用于比较文本行序列
  • HtmlDiff: 生成HTML格式的差异报告
  • unified_diff: 生成统一格式的差异
  • context_diff: 生成上下文格式的差异

1.2 基本使用示例

简单文本对比

import difflib

# Two small multi-line samples to compare.
text1 = """line 1
line 2
line 3
line 4"""

text2 = """line 1
line 2 modified
line 3
line 5"""

# Split into lines WITHOUT their terminators. unified_diff is called with
# lineterm='' below, which makes its header lines newline-free; feeding it
# newline-free content lines keeps every yielded line uniform.
lines1 = text1.splitlines()
lines2 = text2.splitlines()

# Build the unified-format diff (a lazy generator of lines).
diff = difflib.unified_diff(
    lines1,
    lines2,
    fromfile='original.txt',
    tofile='modified.txt',
    lineterm='',
)

# print() supplies exactly one trailing newline per diff line.
for line in diff:
    print(line)

字符串相似度比较

import difflib

def get_similarity(str1, str2):
    """Return the difflib similarity ratio of two strings.

    The value is ``SequenceMatcher.ratio()`` computed with no junk filter.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=str1, b=str2)
    return matcher.ratio()

# Example: the matcher finds "Hello " (6 characters) plus the later "o",
# i.e. 7 matching characters, so the ratio is 2 * 7 / (11 + 12) = 14/23.
text1 = "Hello World"
text2 = "Hello Python"
similarity = get_similarity(text1, text2)
print(f"相似度: {similarity:.2%}")  # prints: 相似度: 60.87%

生成HTML差异报告

import difflib

def generate_html_diff(text1, text2, filename1="文件1", filename2="文件2"):
    """Render an HTML diff page for two texts.

    Both texts are split into lines and handed to ``difflib.HtmlDiff``;
    ``filename1`` and ``filename2`` are passed through as the descriptions
    of the two sides. Returns the string built by ``HtmlDiff.make_file``.
    """
    return difflib.HtmlDiff().make_file(
        text1.splitlines(),
        text2.splitlines(),
        filename1,
        filename2,
    )

# Example usage: an "old" and a "new" revision of a short text.
old_text = """第一行内容
第二行内容
第三行内容"""

new_text = """第一行内容
第二行修改后的内容
第三行内容
新增的第四行"""

html_output = generate_html_diff(old_text, new_text)

# Save the report to a file so it can be viewed.
with open('diff_report.html', mode='w', encoding='utf-8') as report_file:
    report_file.write(html_output)

1.3 高级功能

文件内容对比

import difflib

def compare_files(file1_path, file2_path):
    """Return a ``difflib.Differ`` delta between two UTF-8 text files.

    Each file is read in full as a list of lines (terminators kept), and
    the result of ``Differ.compare`` is materialised into a list.
    """
    def _load(path):
        # Read one file completely; the handle is closed on return.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.readlines()

    left = _load(file1_path)
    right = _load(file2_path)

    # Detailed, line-by-line difference report.
    return list(difflib.Differ().compare(left, right))

# 使用示例
# diff_result = compare_files('file1.txt', 'file2.txt')
# for line in diff_result:
#     print(line.rstrip())

查找最相似的匹配

import difflib

def find_best_match(query, candidates, cutoff=0.6):
    """Return up to three candidates that most closely resemble ``query``.

    Thin wrapper over ``difflib.get_close_matches`` with ``n`` fixed at 3;
    ``cutoff`` is forwarded unchanged.
    """
    return difflib.get_close_matches(query, candidates, n=3, cutoff=cutoff)

# Example: look up the words closest to a short query string.
words = ['apple', 'application', 'apply', 'appreciate', 'approach']
query = 'app'
matches = find_best_match(query, candidates=words)
print(f"与 '{query}' 最相似的词: {matches}")

2. 其他文本对比工具

2.1 Linux命令行工具

diff命令

# Basic usage
diff file1.txt file2.txt

# Produce unified-format output
diff -u file1.txt file2.txt

# Ignore whitespace differences
diff -w file1.txt file2.txt

# Recursively compare directories
diff -r dir1/ dir2/

vimdiff

# Compare the two files visually using vim
vimdiff file1.txt file2.txt

2.2 在线文本对比工具

  • Diffchecker: https://www.diffchecker.com/
  • Text Compare: http://text-compare.com/
  • DiffNow: https://www.diffnow.com/

3. 实际应用场景

3.1 代码版本对比

import difflib
import os

def compare_code_versions(old_version, new_version):
    """Return a unified diff between two versions of a source file.

    Args:
        old_version: Path to the original file.
        new_version: Path to the updated file.

    Returns:
        The unified diff as a single newline-joined string (empty when the
        two files contain the same lines), or the message "文件不存在" when
        either path is not an existing regular file.
    """
    if not (os.path.isfile(old_version) and os.path.isfile(new_version)):
        return "文件不存在"

    # Drop each line's terminator before diffing. The diff is generated with
    # lineterm='' and joined with '\n' below, so keeping the terminators
    # would put a blank line after every content line of the result.
    with open(old_version, 'r', encoding='utf-8') as f1:
        old_lines = [line.rstrip('\n') for line in f1]
    with open(new_version, 'r', encoding='utf-8') as f2:
        new_lines = [line.rstrip('\n') for line in f2]

    diff = difflib.unified_diff(
        old_lines, new_lines,
        fromfile=f'a/{os.path.basename(old_version)}',
        tofile=f'b/{os.path.basename(new_version)}',
        lineterm=''
    )

    return '\n'.join(diff)

3.2 配置文件对比

import difflib
import json

def compare_json_configs(config1, config2):
    """Return a unified diff of two JSON-serialisable configurations.

    Both objects are dumped with sorted keys and a 2-space indent so that
    key order and formatting cannot produce spurious differences.

    Args:
        config1: The first configuration object (labelled config1.json).
        config2: The second configuration object (labelled config2.json).

    Returns:
        A list of unified-diff lines, none of which carries a line
        terminator; the list is empty when both dumps are identical.
    """
    def format_json(data):
        # Canonical text form: stable key order, non-ASCII kept readable.
        return json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)

    # Split WITHOUT terminators: with lineterm='' the header lines have no
    # newline, so keeping newlines on content lines would make the returned
    # list inconsistent.
    lines1 = format_json(config1).splitlines()
    lines2 = format_json(config2).splitlines()

    diff = difflib.unified_diff(lines1, lines2,
                                fromfile='config1.json',
                                tofile='config2.json',
                                lineterm='')

    return list(diff)

# Example: diff an old configuration against its updated version.
config_old = {"name": "app", "version": "1.0", "debug": True}
config_new = {"name": "app", "version": "1.1", "debug": False, "port": 8080}

diff_result = compare_json_configs(config_old, config_new)
for diff_line in diff_result:
    # Trailing whitespace is trimmed so print() controls the line ending.
    print(diff_line.rstrip())

4. 最佳实践

4.1 性能优化

  • 对于大文件,考虑分块处理
  • 使用合适的cutoff值来过滤低相似度结果
  • 预处理文本(去除空白、标准化格式)

4.2 输出格式选择

  • 统一格式(unified): 适合代码审查
  • 上下文格式(context): 提供更多上下文信息
  • HTML格式: 适合可视化展示
  • 自定义格式: 根据具体需求定制

4.3 常见问题处理

import difflib

def robust_text_compare(text1, text2, ignore_case=False, ignore_whitespace=False):
    """Return the difflib similarity ratio of two texts after normalisation.

    Args:
        text1: First text.
        text2: Second text.
        ignore_case: When true, both texts are case-folded before comparing.
        ignore_whitespace: When true, leading/trailing whitespace is removed
            and every internal whitespace run becomes a single space.

    Returns:
        ``SequenceMatcher.ratio()`` of the two normalised texts.
    """
    if ignore_case:
        # casefold() rather than lower(): it is the caseless-matching form
        # for Unicode text (e.g. "ß" and "SS" both fold to "ss").
        text1 = text1.casefold()
        text2 = text2.casefold()

    if ignore_whitespace:
        # split() with no argument drops all whitespace runs.
        text1 = ' '.join(text1.split())
        text2 = ' '.join(text2.split())

    return difflib.SequenceMatcher(None, text1, text2).ratio()

# Example: the same pair compared strictly, then with both options enabled.
text1 = "Hello World  "
text2 = "hello world"
print(f"严格比较: {robust_text_compare(text1, text2):.2%}")
print(f"忽略大小写和空白: {robust_text_compare(text1, text2, True, True):.2%}")