Skip to content

Commit f9114f0

Browse files
committed
add
1 parent 33ee7d2 commit f9114f0

3 files changed

Lines changed: 126 additions & 0 deletions

File tree

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,20 @@ bash download.sh
4848
bash run_demo.sh
4949
```
5050

51+
## Adding new papers to the corpus
52+
To update your corpus with the latest papers, follow these steps:
53+
54+
1. Download the most recent arXiv metadata from Kaggle and save it to a path of your choice (referred to below as `ARXIV_META_DATA_PATH`).
55+
2. Run the data processing script:
56+
```bash
57+
cd utils/
58+
python process_arxiv_meta_data.py ARXIV_META_DATA_PATH ../data/corpus_data_arxiv_1215.jsonl
59+
```
60+
3. Generate the embeddings for the corpus:
61+
```bash
62+
bash encode_corpus.sh
63+
```
64+
5165
## 📖 Demo Video
5266

5367
[![Scholar Copilot Demo Video](https://img.youtube.com/vi/QlY7S52sWDA/maxresdefault.jpg)](https://www.youtube.com/watch?v=QlY7S52sWDA)

utils/encode_corpus.sh

Whitespace-only changes.

utils/process_arxiv_meta_data.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import sys
2+
import json
3+
import os
4+
5+
6+
def single_process_arxiv_metadata(item, corpus_id):
    """Convert one arXiv metadata record into a corpus entry.

    Args:
        item: Dict decoded from one line of the arXiv metadata file. The keys
            ``id``, ``title`` and ``abstract`` are required; ``authors``,
            ``authors_parsed``, ``versions`` and ``categories`` are used when
            present.
        corpus_id: Identifier stored under ``"corpus_id"`` in the result.

    Returns:
        A dict with the keys ``corpus_id``, ``paper_id``, ``title``,
        ``abstract`` (wrapped in ``<|reference_start|>``/``<|reference_end|>``
        markers), ``source`` (always ``"arxiv"``), ``bibtex`` and
        ``citation_key``.

    Raises:
        KeyError: If ``id``, ``title`` or ``abstract`` is missing from *item*.
    """
    import re  # kept function-local so the block does not rely on file-level imports

    paper_id = item["id"]
    title = item["title"]
    abstract = f"<|reference_start|>{item['abstract']}<|reference_end|>"

    parsed_authors = item.get("authors_parsed") or []

    # Build the author string as "First Last" names joined by " and ".
    # Entries are read as [last, first, ...]; an author with only one of the
    # two parts (e.g. a collaboration name) is kept instead of being dropped.
    authors = []
    for author in parsed_authors:
        last = author[0] if len(author) > 0 else ""
        first = author[1] if len(author) > 1 else ""
        full_name = " ".join(part for part in (first, last) if part)
        if full_name:
            authors.append(full_name)
    if authors:
        authors_str = " and ".join(authors)
    else:
        # Nothing usable in authors_parsed: fall back to the raw author string.
        authors_str = item.get("authors") or ""

    # Year: first four-digit run in the creation date of the first version,
    # e.g. "Mon, 30 Apr 2007 20:32:04 GMT" -> "2007".
    year = ""
    versions = item.get("versions") or []
    if versions:
        year_match = re.search(r"\d{4}", versions[0]["created"])
        if year_match:
            year = year_match.group(0)

    # Citation key: <first author surname><year><first word of title>,
    # all lower-cased. Missing pieces contribute an empty string.
    if parsed_authors and parsed_authors[0]:
        first_author_surname = parsed_authors[0][0].lower()
    else:
        # Fall back to the first token of the raw author string.
        raw_tokens = (item.get("authors") or "").split()
        first_author_surname = raw_tokens[0].lower() if raw_tokens else ""

    title_tokens = title.split()
    first_title_word = title_tokens[0].lower() if title_tokens else ""
    citation_key = f"{first_author_surname}{year}{first_title_word}"

    # Categories are whitespace-separated (e.g. "cs.CL cs.LG"); split them as
    # they are so the primary class and any secondary classes stay distinct.
    class_list = (item.get("categories") or "").split()

    # Assemble the BibTeX entry.
    bibtex = f"@article{{{citation_key},\n"
    bibtex += f" title={{{title}}},\n"
    bibtex += f" author={{{authors_str}}},\n"
    bibtex += f" journal={{arXiv preprint arXiv:{paper_id}}},\n"
    bibtex += f" year={{{year}}},\n"
    bibtex += f" archivePrefix={{arXiv}},\n"
    bibtex += f" eprint={{{paper_id}}},\n"
    if class_list:
        bibtex += f" primaryClass={{{' '.join(class_list)}}}\n"
    bibtex += "}"

    return {
        "corpus_id": corpus_id,
        "paper_id": paper_id,
        "title": title,
        "abstract": abstract,
        "source": "arxiv",
        "bibtex": bibtex,
        "citation_key": citation_key,
    }
81+
82+
83+
def process_meta_data(input_meta_file, output_corpus_file, max_items=1001):
    """Convert an arXiv metadata JSON-lines file into a corpus JSON-lines file.

    Each non-blank input line is decoded as JSON, converted with
    ``single_process_arxiv_metadata`` and assigned the id ``arxiv-<n>``,
    where ``n`` counts converted records from 0. The output file is written
    only after the input has been read.

    Args:
        input_meta_file: Path to the input metadata file (one JSON object per
            line).
        output_corpus_file: Path of the JSON-lines file to write.
        max_items: Maximum number of records to convert. The default of 1001
            matches the previously hard-coded cutoff; pass ``None`` to
            convert the whole file.
    """
    output_corpus_data = []
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(input_meta_file, "r", encoding="utf-8") as fi:
        for line in fi:
            if max_items is not None and len(output_corpus_data) >= max_items:
                break
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not valid JSON.
                continue
            curr_meta_data = json.loads(line)
            corpus_id = f"arxiv-{len(output_corpus_data)}"
            output_corpus_data.append(
                single_process_arxiv_metadata(curr_meta_data, corpus_id)
            )

    with open(output_corpus_file, "w", encoding="utf-8") as fo:
        fo.writelines(json.dumps(each) + "\n" for each in output_corpus_data)
98+
99+
100+
if __name__ == "__main__":
    # Both paths are read below, so require both; the earlier check
    # (len(sys.argv) < 2) let a single argument through and then failed with
    # an IndexError on sys.argv[2].
    if len(sys.argv) < 3:
        print("usage: python script.py <input file path> <output file path>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]

    process_meta_data(input_file, output_file)
109+
110+
111+
112+

0 commit comments

Comments
 (0)