|
| 1 | +import sys |
| 2 | +import json |
| 3 | +import os |
| 4 | + |
| 5 | + |
def single_process_arxiv_metadata(item, corpus_id):
    """Convert one arXiv metadata record into a corpus entry with BibTeX.

    Args:
        item: Mapping describing a single paper. ``id``, ``title`` and
            ``abstract`` are required. ``authors_parsed`` (a sequence of
            ``[surname, given_name, ...]`` entries), ``authors`` (raw author
            string), ``versions`` (list of dicts carrying a ``created`` date
            string) and ``categories`` (whitespace-separated string) are
            used when present.
        corpus_id: Identifier stored unchanged in the result.

    Returns:
        A dict with the keys ``corpus_id``, ``paper_id``, ``title``,
        ``abstract`` (wrapped in ``<|reference_start|>`` /
        ``<|reference_end|>`` markers), ``source`` (always ``"arxiv"``),
        ``bibtex`` and ``citation_key``.

    Raises:
        KeyError: If ``id``, ``title`` or ``abstract`` is missing.
    """
    # Function-scope import: this block needs ``re`` unconditionally and must
    # not rely on a file-level import being present.
    import re

    paper_id = item["id"]
    title = item["title"]
    abstract = f"<|reference_start|>{item['abstract']}<|reference_end|>"

    parsed_authors = item.get("authors_parsed") or []
    raw_authors = item.get("authors") or ""

    # Build "Given Surname and Given Surname ...". Entries that only carry
    # one of the two parts (e.g. a collaboration name stored as the surname)
    # are kept instead of being dropped.
    names = []
    for author in parsed_authors:
        surname = author[0] if len(author) > 0 else ""
        given = author[1] if len(author) > 1 else ""
        full_name = " ".join(part for part in (given, surname) if part)
        if full_name:
            names.append(full_name)
    # Fall back to the raw author string when nothing usable was parsed.
    authors_str = " and ".join(names) if names else raw_authors

    # Year: first four-digit run in the first version's timestamp,
    # e.g. "Mon, 30 Apr 2007 20:32:04 GMT" -> "2007".
    year = ""
    versions = item.get("versions") or []
    if versions:
        year_match = re.search(r"\d{4}", versions[0].get("created") or "")
        if year_match:
            year = year_match.group(0)

    # Citation key: <first author surname><year><first title word>, lowercased.
    if parsed_authors and parsed_authors[0]:
        first_author_surname = parsed_authors[0][0]
    else:
        # No parsed authors: take the first token of the raw author string.
        raw_tokens = raw_authors.split()
        first_author_surname = raw_tokens[0] if raw_tokens else ""
    title_tokens = title.split()
    first_title_word = title_tokens[0] if title_tokens else ""
    raw_key = f"{first_author_surname}{year}{first_title_word}".lower()
    # Remove whitespace and characters that BibTeX/TeX do not accept in keys
    # (multi-word surnames such as "van der Berg" would otherwise break it).
    citation_key = re.sub(r"""[\s"#%'(),={}\\~$]+""", "", raw_key)

    # Categories stay whitespace-separated; the primary class comes first,
    # followed by any cross-listed classes.
    classes = (item.get("categories") or "").split()

    bibtex_lines = [
        f"@article{{{citation_key},",
        f" title={{{title}}},",
        f" author={{{authors_str}}},",
        f" journal={{arXiv preprint arXiv:{paper_id}}},",
        f" year={{{year}}},",
        " archivePrefix={arXiv},",
        f" eprint={{{paper_id}}},",
    ]
    if classes:
        class_field = " ".join(classes)
        bibtex_lines.append(f" primaryClass={{{class_field}}}")
    bibtex = "\n".join(bibtex_lines) + "\n}"

    return {
        "corpus_id": corpus_id,
        "paper_id": paper_id,
        "title": title,
        "abstract": abstract,
        "source": "arxiv",
        "bibtex": bibtex,
        "citation_key": citation_key,
    }
| 81 | + |
| 82 | + |
def process_meta_data(input_meta_file, output_corpus_file, max_items=1000):
    """Convert a JSON-lines arXiv metadata file into a JSON-lines corpus file.

    Each non-blank input line is parsed as JSON, converted with
    ``single_process_arxiv_metadata`` and given the corpus id ``arxiv-<n>``,
    where ``n`` counts converted records from zero. All records are read
    before the output file is opened for writing.

    Args:
        input_meta_file: Path of the input file, one JSON object per line.
        output_corpus_file: Path of the output file; overwritten if present.
        max_items: Maximum number of records to convert. ``None`` removes the
            cap. Defaults to 1000.
    """
    output_corpus_data = []
    # Explicit UTF-8 so the result does not depend on the platform default.
    with open(input_meta_file, "r", encoding="utf-8") as fi:
        for line in fi:
            # Check the cap before converting so exactly ``max_items``
            # records are produced, not one more.
            if max_items is not None and len(output_corpus_data) >= max_items:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            curr_meta_data = json.loads(line)
            corpus_id = f"arxiv-{len(output_corpus_data)}"
            output_corpus_data.append(
                single_process_arxiv_metadata(curr_meta_data, corpus_id)
            )

    with open(output_corpus_file, "w", encoding="utf-8") as fo:
        fo.writelines(json.dumps(each) + "\n" for each in output_corpus_data)
| 98 | + |
| 99 | + |
if __name__ == "__main__":
    # Both paths are read below, so both must be supplied; checking for only
    # one argument would let a missing output path surface as an IndexError.
    if len(sys.argv) < 3:
        print("usage: python script.py <input file path> <output file path>")
        sys.exit(1)

    input_file, output_file = sys.argv[1], sys.argv[2]

    process_meta_data(input_file, output_file)
| 109 | + |
| 110 | + |
| 111 | + |
| 112 | + |
0 commit comments