ebook-treasure-chest/scripts/parse_md_to_json.py at main · sony9154/ebook-treasure-chest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
解析 md 目錄下的所有 Markdown 檔案，生成統一的 JSON 資料檔案
"""

import json
import re
from pathlib import Path
from collections import defaultdict

# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).parent.parent
# Directory whose top-level *.md files are parsed by main().
MD_DIR = ROOT / "md"
# Destination of the combined list of book records.
OUTPUT_JSON = ROOT / "docs" / "all-books.json"
# Destination of the parse statistics summary.
STATS_FILE = ROOT / "docs" / "parse-stats.json"


def extract_category_from_file(file_path):
    """Return the category name derived from a Markdown file's name.

    The category is the file name with its final suffix removed, e.g.
    ``md/小說.md`` yields ``小說``.

    Args:
        file_path: A ``pathlib.Path`` (or any object exposing ``.stem``).

    Returns:
        The ``stem`` of ``file_path``.
    """
    return file_path.stem


def extract_category_from_content(content):
    """Return the category name taken from a level-1 heading (fallback method).

    The first line that starts with ``"# "`` and mentions neither ``版權`` nor
    ``宣告`` (i.e. is not the copyright notice heading) supplies the category;
    the text after the ``"# "`` prefix is returned with surrounding
    whitespace removed.

    Args:
        content: Full text of a Markdown file.

    Returns:
        The heading text, or ``None`` when the content has nothing shaped
        like ``# <text>`` or every such heading is a copyright notice.
    """
    # Cheap gate: without any "# <text>" shaped line there is nothing to scan.
    if re.search(r'^#\s+(.+)$', content, re.MULTILINE) is None:
        return None

    candidates = (
        row[2:].strip()
        for row in content.split('\n')
        if row.startswith('# ') and '版權' not in row and '宣告' not in row
    )
    return next(candidates, None)


def parse_markdown_table(content):
    """Parse the book table of a Markdown document into a list of records.

    The table is located by its header row, i.e. the first line containing
    ``"| 書名"`` or ``"書名 |"``. Every later line that starts with ``|`` and
    has the shape ``| title | author | [下載](link) |`` produces one record.
    Lines that do not fit (separator rows, blank lines, prose) are skipped,
    and scanning continues to the end of the document.

    Args:
        content: Full text of a Markdown file.

    Returns:
        A list of dicts with the keys ``title``, ``author`` and ``link``.
        Bold markers (``**``) are removed from title and author, and an empty
        author becomes ``'未知'``. The list is empty when no header row is
        found.
    """
    lines = content.split('\n')

    # Locate the header row (the first line mentioning the "書名" column).
    header_index = next(
        (idx for idx, text in enumerate(lines)
         if '| 書名' in text or '書名 |' in text),
        None,
    )
    if header_index is None:
        return []

    # Compiled once, outside the loop. The author group uses `.*?` so that a
    # completely empty cell ("||") still matches and can fall back to '未知'.
    row_pattern = re.compile(
        r'\|\s*(.+?)\s*\|\s*(.*?)\s*\|\s*\[下載\]\((.+?)\)\s*\|'
    )

    books = []
    # Start right after the header. The "---" separator row never matches
    # row_pattern, so it is skipped naturally, and a table that lacks the
    # separator no longer loses its first data row.
    for raw_line in lines[header_index + 1:]:
        line = raw_line.strip()
        if not line.startswith('|'):
            continue

        match = row_pattern.match(line)
        if match is None:
            continue

        # Strip Markdown bold markers and surrounding whitespace.
        title = match.group(1).replace('**', '').strip()
        author = match.group(2).replace('**', '').strip()
        link = match.group(3).strip()

        # A record is only useful with both a title and a download link.
        if title and link:
            books.append({
                'title': title,
                'author': author if author else '未知',
                'link': link,
            })

    return books


def parse_single_file(file_path):
    """Parse one Markdown file into its category and book records.

    The category comes from the file's first non-copyright ``# `` heading
    when there is one, otherwise from the file name. Each book record parsed
    from the table is extended with ``category``, ``language``, ``level`` and
    ``formats`` entries (the last three are fixed default values).

    Args:
        file_path: Path of the Markdown file to read (UTF-8).

    Returns:
        A ``(category, books)`` tuple. When the file cannot be read, a
        warning is printed and ``(None, [])`` is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as exc:
        print(f"⚠️  讀取檔案失敗 {file_path}: {exc}")
        return None, []

    # The heading inside the file is preferred over the file name.
    name_category = extract_category_from_file(file_path)
    heading_category = extract_category_from_content(text)
    category = heading_category if heading_category else name_category

    books = parse_markdown_table(text)

    # Attach the category plus default metadata to every record.
    for record in books:
        record.update(
            category=category,
            language='ZH',  # default: Chinese
            level='Unknown',
            formats=['epub', 'mobi', 'azw3'],  # fresh list per record
        )

    return category, books


def main():
    """Parse every Markdown file in ``MD_DIR`` and write the JSON outputs.

    Steps:
      1. Parse each ``*.md`` file directly inside ``MD_DIR``.
      2. Write all book records to ``OUTPUT_JSON``.
      3. Write a statistics summary to ``STATS_FILE``. It holds file counts,
         the book count, the 20 largest categories and the first 10 files
         that yielded no data.
      4. Print progress, a summary and the 10 largest categories.

    A file counts as failed when it cannot be read or yields no book records.
    """
    print("🚀 開始解析 md 檔案...")

    all_books = []
    category_stats = defaultdict(int)
    success_files = 0
    error_files = []

    # Sorted so the order of records in the output is stable between runs;
    # glob() itself makes no ordering guarantee.
    md_files = sorted(MD_DIR.glob("*.md"))
    total_files = len(md_files)

    print(f"📁 找到 {total_files} 個 md 檔案")

    for i, md_file in enumerate(md_files, 1):
        # Report progress every 100 files.
        if i % 100 == 0:
            print(f"⏳ 處理進度: {i}/{total_files} ({i*100//total_files}%)")

        category, books = parse_single_file(md_file)

        # category is None when the file could not be read.
        if category is None:
            error_files.append(str(md_file))
            continue

        if not books:
            error_files.append(str(md_file))
            print(f"⚠️  未找到資料: {md_file.name}")
            continue

        all_books.extend(books)
        # Accumulate rather than overwrite: several files may resolve to the
        # same category name (it can come from the heading inside the file).
        category_stats[category] += len(books)
        success_files += 1

    OUTPUT_JSON.parent.mkdir(exist_ok=True)

    print("\n📊 解析統計:")
    print(f"  - 總檔案數: {total_files}")
    print(f"  - 成功解析: {success_files}")
    print(f"  - 失敗檔案: {len(error_files)}")
    print(f"  - 總書籍數: {len(all_books)}")
    print(f"  - 分類數量: {len(category_stats)}")

    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(all_books, f, ensure_ascii=False, indent=2)

    print(f"\n✅ JSON 檔案已生成: {OUTPUT_JSON}")
    print(f"📦 檔案大小: {OUTPUT_JSON.stat().st_size / 1024 / 1024:.2f} MB")

    # Categories ordered by book count, largest first; computed once and
    # reused for both the stats file and the console report.
    ranked_categories = sorted(
        category_stats.items(), key=lambda item: item[1], reverse=True
    )

    stats = {
        'total_files': total_files,
        'success_files': success_files,
        'error_files': len(error_files),
        'total_books': len(all_books),
        'categories_count': len(category_stats),
        'top_categories': dict(ranked_categories[:20]),
        'error_file_list': error_files[:10],  # keep only the first 10
    }

    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)

    print(f"📈 統計資訊已儲存: {STATS_FILE}")

    print("\n🏆 前10個分類（按書籍數量）:")
    for cat, count in ranked_categories[:10]:
        print(f"  - {cat}: {count} 本")


# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()