feat:重构代码

This commit is contained in:
zyronon
2025-07-13 03:37:18 +08:00
parent 97c0c47746
commit 410f43e0c9
8 changed files with 270 additions and 107657 deletions

View File

@@ -1,3 +0,0 @@
{"word":"tow","phonetic0":"/ təʊ /","phonetic1":"/ toʊ /","trans":[{"pos":"v.","cn":"拖,牵引(车、船等);(人)牵引,拖带(某人,某物)"},{"pos":"n.","cn":"(对车船等的)牵引,拖;牵引绳,拖链"},{"pos":"","cn":"【名】 Tow人名"}],"sentences":[{"v":"They threatened to tow away my car.","tran":"他们威胁说要拖走我的车。"},{"v":"He had been using the vehicle to tow his work trailer..","tran":"他一直用那辆车来拖他的活动工作室。"},{"v":"The car broke down and we had to get somebody to give us a tow.","tran":"汽车抛锚了,我们只得让人拖走。"}],"phrases":[{"v":"in tow","tran":"拖着;在一起"},{"v":"tow truck","tran":"n. 拖车;牵引车"},{"v":"on tow","tran":"(车辆等)被拖(带)着"}],"synos":[{"pos":"n.","tran":"拖;[纺]麻的粗纤维;拖曳所用之绳","ws":["pull","drag"]},{"pos":"vt.","tran":"拖;牵引;曳","ws":["pull","trail"]}],"relWords":[],"memory":""},
{"word":"toward","phonetic0":"/ təˈːd /","phonetic1":"/ tɔːrd /","trans":[{"pos":"prep.","cn":"向趋向对于接近时间靠近用于为了同towards"},{"pos":"adj.","cn":"即将来到的,进行中的"},{"pos":"n.","cn":"Toward美、加、沙特特沃德"}],"sentences":[{"v":"She turned back toward home.","tran":"她折了回来往家走去。"},{"v":"I took a step toward him.","tran":"我朝他迈出了一步。"},{"v":"He headed downhill toward the river.","tran":"他朝山脚下的小河走去。"}],"phrases":[{"v":"go toward","tran":"朝...方向走去"},{"v":"make toward","tran":"向…延伸;向…前进"}],"synos":[{"pos":"prep.","tran":"向;对于;为了;接近","ws":["unto","upon","out"]},{"pos":"adj.","tran":"即将来到的,进行中的","ws":["going","underway"]}],"relWords":[],"memory":""},
{"word":"take","phonetic0":"/ teɪk /","phonetic1":"/ teɪk /","trans":[{"pos":"v.","cn":"携带,拿走;带去,引领;使达到,提升;拿,取;移走,拿开;偷走,误拿;取材于,收集;攻占,控制;选中,买下;订阅(报纸等);吃,服用;减去;记录,摘录;照相,摄影;量取,测定;就(座);以…...为例;接受,收取;接纳,接待(顾客、患者等);遭受,经受;忍受,容忍;(以某种方式)对待,处理;理解,考虑;误以为;赢得(比赛、竞赛等);产生(感情),持有(看法);采取(措施),采用(方法);做,拥有;采用(形式),就任(职位);花费,占用(时间); 需要,要求;使用;穿(特定尺码的鞋或衣物);容纳;授课;学习,选修(课程);参加(考试或测验);走(路线),乘坐(交通工具);跨过,跳过;踢,掷;举行投票,进行民意调查;成功,奏效;(语法)需带有(某种结构)"},{"pos":"n.","cn":"(一次拍摄的)镜头,场景;收入量;看法,态度;(印刷)一次排版量"}],"sentences":[{"v":"I'll take my coat upstairs. Shall I take yours, Roberta?","tran":"我将把我的外套拿到楼上去。要我把你的拿上去吗,罗伯塔?"},{"v":"She can't take criticism.","tran":"她受不了批评。"},{"v":"We take the 'Express'.","tran":"我们订阅的是《快报》。"}],"phrases":[{"v":"take some","tran":"不大容易"},{"v":"take care of oneself","tran":"照顾自己;颐养"},{"v":"take part","tran":"参与, 参加"}],"synos":[{"pos":"vt.","tran":"拿,取;采取;吃;接受","ws":["carry","adopt","have","eat","assume"]},{"pos":"vi.","tran":"拿;获得","ws":["pick up","get access to"]}],"relWords":[],"memory":""},

View File

@@ -1,28 +1,45 @@
const fs = require('fs');
const path = require('path');
const { chromium } = require('playwright');
const {chromium} = require('playwright');
// Input word lists are read from source/; per-file crawl results go to result/.
const SOURCE_DIR = path.join(__dirname, 'source');
const RESULT_DIR = path.join(__dirname, 'result');
// Cumulative crawl output and the list of failed words are kept under save/.
const SAVE_DIR = path.join(__dirname, 'save');
const TOTAL_RESULT_FILE = path.join(SAVE_DIR, 'all.json');
const FAILED_FILE = path.join(SAVE_DIR, 'failed.json');

// Promise-based delay helper: resolves after `ms` milliseconds.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Maximum number of words to crawl (debugging aid).
const MAX_COUNT = 1;
let crawlCount = 0;
const allResults = [];
const failedWords = [];
// Previously crawled records, keyed by lower-cased word.
const existingMap = new Map();

// Create both output directories up front. `recursive: true` makes this a
// no-op when a directory already exists, and guarantees save/ is present
// before anything is appended or written into it.
for (const dir of [RESULT_DIR, SAVE_DIR]) {
  fs.mkdirSync(dir, { recursive: true });
}
// Load records crawled by earlier runs so that crawling can be incremental.
// The total-result file holds one JSON object per line; each record is
// indexed in `existingMap` by its lower-cased word.
if (fs.existsSync(TOTAL_RESULT_FILE)) {
  const lines = fs.readFileSync(TOTAL_RESULT_FILE, 'utf-8')
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean);
  let skipped = 0;
  for (const line of lines) {
    try {
      // Tolerate a trailing comma after the record: a serialized object
      // always ends with '}', so stripping one final ',' is safe.
      const obj = JSON.parse(line.replace(/,$/, ''));
      if (obj?.word) {
        existingMap.set(obj.word.toLowerCase(), obj);
      }
    } catch {
      // A malformed line must not abort the run, but it should not vanish
      // without a trace either; count it and report once below.
      skipped += 1;
    }
  }
  console.log(`📦 已加载 ${existingMap.size} 个已爬词`);
  if (skipped > 0) {
    console.warn(`⚠️ 有 ${skipped} 行无法解析,已忽略`);
  }
}
/**
 * Immediately appends one crawled record to the total-result file.
 *
 * The record is written exactly once, as a single line of JSON terminated by
 * a newline and with no trailing comma, so the file can be read back line by
 * line with `JSON.parse`.
 *
 * @param {object} result - The crawled word record to persist.
 * @returns {void}
 */
function appendToAll(result) {
  fs.appendFileSync(TOTAL_RESULT_FILE, `${JSON.stringify(result)}\n`, 'utf-8');
}
async function crawlWord(word, page, retry = 0) {
word = 'private'
const data = {
word: word,
phonetic0: '',
@@ -31,25 +48,27 @@ async function crawlWord(word, page, retry = 0) {
sentences: [],
phrases: [],
synos: [],
relWords: [],
memory: '',
relWords: {
root: '',
rels: []
},
etymology: [],
};
const url = `https://www.youdao.com/result?word=${encodeURIComponent(word)}&lang=en`;
try {
await page.goto(url, { waitUntil: 'networkidle', timeout: 15000 });
await page.goto(url, {waitUntil: 'networkidle', timeout: 15000});
// word
const titleEl = await page.locator('.title').first();
data.word = await titleEl.evaluate(el => el.firstChild?.nodeValue || '');
// phonetic
const phones = await page.$$('.per-phone .phonetic');
if (phones[0]) data.phonetic0 = (await phones[0].textContent())?.trim() || '';
if (phones[1]) data.phonetic1 = (await phones[1].textContent())?.trim() || '';
data.phonetic0 = data.phonetic0.replaceAll('/', '').trim()
data.phonetic1 = data.phonetic1.replaceAll('/', '').trim()
// trans
const trans = await page.$$('.basic .word-exp');
for (const el of trans) {
const pos = await el.$('.pos');
@@ -60,32 +79,29 @@ async function crawlWord(word, page, retry = 0) {
});
}
// sentences
const sentList = await page.$$('.blng_sents_part .trans-container ul li .col2');
for (const el of sentList) {
const en = await el.$('.sen-eng');
const ch = await el.$('.sen-ch');
data.sentences.push({
v: en ? (await en.textContent())?.trim() : '',
tran: ch ? (await ch.textContent())?.trim() : '',
c: en ? (await en.textContent())?.trim() : '',
cn: ch ? (await ch.textContent())?.trim() : '',
});
}
// phrases
const phrs = await page.$$('.phrs ul li .phrs-content');
for (const el of phrs) {
const point = await el.$('.point');
const tran = await el.$('.phr_trans');
data.phrases.push({
v: point ? (await point.textContent())?.trim() : '',
tran: tran ? (await tran.textContent())?.trim() : '',
c: point ? (await point.textContent())?.trim() : '',
cn: tran ? (await tran.textContent())?.trim() : '',
});
}
// 同义词optional
try {
await page.getByText('同近义词', { timeout: 2000 }).click();
await page.waitForSelector('.syno', { timeout: 3000 });
await page.getByText('同近义词', {timeout: 2000}).click();
await page.waitForSelector('.syno', {timeout: 3000});
const synos = await page.$$('.syno-item');
for (const el of synos) {
const pos = await el.$('.index');
@@ -94,11 +110,59 @@ async function crawlWord(word, page, retry = 0) {
let str = wordEl ? (await wordEl.textContent())?.trim() : '';
data.synos.push({
pos: pos ? (await pos.textContent())?.trim() : '',
tran: tran ? (await tran.textContent())?.trim() : '',
cn: tran ? (await tran.textContent())?.trim() : '',
ws: str.split('/').map(s => s.trim()).filter(Boolean),
});
}
} catch {}
} catch {
}
try {
await page.getByText('同根词', {timeout: 2000}).click();
await page.waitForSelector('.rel_word', {timeout: 3000});
const cigen = await page.$('.trans-container > p .point');
data.relWords.root = cigen ? (await cigen.textContent())?.trim() : ''
const rel_word_item_list = await page.$$('.rel_word_item');
for (const el of rel_word_item_list) {
let item = {
words: []
}
const pos = await el.$('.pos');
item.pos = pos ? (await pos.textContent())?.trim() : ''
const rel_content_list = await el.$$('.rel_content p');
for (const el2 of rel_content_list) {
const word = await el2.$('.point');
let wordStr = word ? (await word.textContent())?.trim() : ''
let str = el2 ? (await el2.textContent())?.trim() : ''
str = str.replace(wordStr, '');
item.words.push({
c: wordStr,
cn: str
})
}
data.relWords.rels.push(item);
}
} catch (e) {
console.log('报错了', e)
}
try {
await page.getByText('词源', {timeout: 2000}).click();
await page.waitForSelector('.etymology', {timeout: 3000});
const trans_cell = await page.$$('.trans-cell');
for (const el of trans_cell) {
const header = await el.$('.header');
const zh_result = await el.$('.zh_result');
data.etymology.push({
t: header ? (await header.textContent())?.trim() : '',
d: zh_result ? (await zh_result.textContent())?.trim() : '',
});
}
} catch {
}
return data;
} catch (err) {
@@ -107,7 +171,8 @@ async function crawlWord(word, page, retry = 0) {
await sleep(1000);
return crawlWord(word, page, retry + 1);
} else {
console.log(`${word} 抓取失败,跳过。`);
console.log(`${word} 抓取失败`);
failedWords.push(word);
return null;
}
}
@@ -115,7 +180,7 @@ async function crawlWord(word, page, retry = 0) {
(async () => {
const files = fs.readdirSync(SOURCE_DIR).filter(f => f.endsWith('.json'));
const browser = await chromium.launch({ headless: true });
const browser = await chromium.launch({headless: true});
const page = await browser.newPage();
for (const file of files) {
@@ -124,21 +189,32 @@ async function crawlWord(word, page, retry = 0) {
const wordList = raw.map(obj => obj.word).filter(Boolean);
const resultForThisFile = [];
console.log(`📂 处理文件:${file},共 ${wordList.length} 个单词`);
for (const word of wordList) {
for (let i = 0; i < wordList.length; i++) {
let word = wordList[i]
const lowerWord = word.toLowerCase();
if (existingMap.has(lowerWord) && false) {
console.log(`⚪ 已爬过 ${word},跳过`);
// 把之前爬取的内容也加入当前文件结果数组
const existData = existingMap.get(lowerWord);
if (existData) {
resultForThisFile.push(existData);
}
continue;
}
if (crawlCount >= MAX_COUNT) {
console.log(`🚫 达到调试上限 ${MAX_COUNT},终止爬取`);
await browser.close();
return;
break;
}
console.log(`爬取:${file}${word},进度:${i} / ${wordList.length}`)
const result = await crawlWord(word, page);
if (result) {
crawlCount++;
appendToAll(result);
resultForThisFile.push(result);
existingMap.set(lowerWord, result);
}
await sleep(500);
@@ -147,10 +223,16 @@ async function crawlWord(word, page, retry = 0) {
const outputName = path.basename(file, '.json') + '_v2.json';
const outputPath = path.join(RESULT_DIR, outputName);
fs.writeFileSync(outputPath, JSON.stringify(resultForThisFile, null, 2), 'utf-8');
console.log(`✅ 已保存:${outputName}`);
}
await browser.close();
// ✅ 保存失败词
if (failedWords.length) {
fs.writeFileSync(FAILED_FILE, JSON.stringify(failedWords, null, 2), 'utf-8');
console.log(`❗ 失败词写入 ${FAILED_FILE}`);
}
console.log('\n🎉 所有任务完成!');
})();

View File

@@ -0,0 +1,121 @@
[
{
"word": "private",
"phonetic0": "ˈpraɪvət",
"phonetic1": "ˈpraɪvət",
"trans": [
{
"pos": "adj.",
"cn": "私有的,自用的;(服务,行业)私营的,民营的;没有股票上市的;私下的,秘密的;(交易未涉及商业机构的)私下里的;(想法、感觉)私密的;未担任公职的,无官职的;与官职(或工作)无关的;安静的,不受打扰的;(人)内敛的,不喜欢谈论私事的;(尤指两人)单独的;私下辅导的;(教育,医疗)私立的;只有自己人明白的"
},
{
"pos": "n.",
"cn": "(美、英陆军或美国海军陆战队)二等兵;<非正式> 私处阴部privates"
}
],
"sentences": [
{
"c": "He has a private income.",
"cn": "他有一笔私人收入。"
},
{
"c": "She now teaches only private pupils.",
"cn": "她现在只当私人教师。"
},
{
"c": "The door is marked \"Private.\"",
"cn": "门上写着“闲人免进”。"
}
],
"phrases": [
{
"c": "in private",
"cn": "私下地;秘密地"
},
{
"c": "private enterprise",
"cn": "民营企业;私营企业"
},
{
"c": "private sector",
"cn": "私营部门;私营成分"
}
],
"synos": [
{
"pos": "adj.",
"cn": "私人的;私有的;私下的",
"ws": [
"chamber",
"cabinet"
]
},
{
"pos": "n.",
"cn": "列兵;二等兵",
"ws": [
"seaman second class"
]
}
],
"relWords": {
"root": "private",
"rels": [
{
"words": [
{
"c": "privately",
"cn": "私下地;秘密地"
}
],
"pos": "adv."
},
{
"words": [
{
"c": "privatization",
"cn": "私有化"
},
{
"c": "privateer",
"cn": "私掠船;武装民船;私掠船船长;私掠船船员"
},
{
"c": "privateness",
"cn": "私人性"
}
],
"pos": "n."
},
{
"words": [
{
"c": "privateer",
"cn": "私掠巡航"
}
],
"pos": "vi."
},
{
"words": [
{
"c": "privatize",
"cn": "使私有化;使归私有"
}
],
"pos": "vt."
}
]
},
"etymology": [
{
"t": "private:私人的,个人的;私下的,秘密的;私营的,民办的",
"d": "词根词缀: -priv-私人,私下 + -ate形容词词尾"
},
{
"t": "private:个人的,私有的",
"d": "来自拉丁语privare,使个人化使分开使剥离来自privus,个人的自己的来自PIE*per,向前穿过词源同ford,first."
}
]
}
]

File diff suppressed because it is too large Load Diff

View File

@@ -1,136 +0,0 @@
[
{
"word": "expect",
"synos": [
{
"pos": "vt.",
"tran": "期望;指望;认为;预料",
"ws": [
"promise oneself",
"guess",
"find",
"feel",
"make"
]
},
{
"pos": "vi.",
"tran": "期待;预期",
"ws": [
"look foward to",
"to look forward to"
]
}
]
},
{
"word": "run",
"synos": [
{
"pos": "vi.",
"tran": "经营;奔跑;运转",
"ws": [
"go",
"fare"
]
},
{
"pos": "vt.",
"tran": "管理,经营;运行;参赛",
"ws": [
"conduct",
"direct",
"control",
"supervise",
"operate"
]
},
{
"pos": "n.",
"tran": "奔跑;赛跑;趋向;奔跑的路程",
"ws": [
"footrace",
"tendency to sth"
]
}
]
},
{
"word": "happy",
"synos": [
{
"pos": "adj.",
"tran": "幸福的;高兴的;巧妙的",
"ws": [
"pleased",
"glad",
"blessed",
"smart"
]
}
]
},
{
"word": "blue",
"synos": [
{
"pos": "adj.",
"tran": "[光]蓝色的;忧郁的,沮丧的;下流的",
"ws": [
"dark",
"disappointed",
"dirty",
"depressed"
]
},
{
"pos": "n.",
"tran": "[光]蓝色",
"ws": [
"azur",
"blau"
]
}
]
},
{
"word": "think",
"synos": [
{
"pos": "vt.",
"tran": "想;认为;想起;想像;打算",
"ws": [
"like",
"imagine",
"expect",
"count",
"guess"
]
},
{
"pos": "vi.",
"tran": "想;认为",
"ws": [
"consider",
"ween"
]
},
{
"pos": "n.",
"tran": "想;想法",
"ws": [
"idea",
"idee"
]
},
{
"pos": "adj.",
"tran": "思想的",
"ws": [
"ideological",
"ideaistic"
]
}
]
}
]

1
js_node/save/all.json Normal file
View File

@@ -0,0 +1 @@
{"word":"advertisement","phonetic0":"ədˈːtɪsmənt","phonetic1":"ˌædvərˈtaɪzmənt","trans":[{"pos":"n.","cn":"广告;(某一类事物的)活广告;广告活动,广告宣传"}],"sentences":[{"v":"The advertisement is for a men's fragrance.","tran":"这则广告介绍一款男士香水。"},{"v":"We had over 100 replies to our advertisement.","tran":"我们的广告宣传收到了100多份答复。"},{"v":"This will make great copy for the advertisement.","tran":"这可当作这则广告的绝妙广告词。"}],"phrases":[{"v":"advertisement company","tran":"广告公司"},{"v":"commercial advertisement","tran":"商业广告"},{"v":"advertisement plan","tran":"广告策划"}],"synos":[{"pos":"n.","tran":"[经]广告,宣传","ws":["publicity","bill","propaganda","dissemination","drumbeating"]}],"relWords":{"root":"advert","rels":[{"words":[{"word":"advertising","cn":"广告的;广告业的"},{"word":"advertised","cn":"广告的"},{"word":"advertizing","cn":"广告的;广告业务的"}],"pos":"adj."},{"words":[{"word":"advertising","cn":"广告;广告业;登广告"},{"word":"advert","cn":"广告"},{"word":"advertiser","cn":"广告客户;刊登广告的人"},{"word":"advertorial","cn":"社论式广告(指常作为杂志中心插页的正式广告文字)"},{"word":"advertizer","cn":"广告客户;广告者"},{"word":"advertizing","cn":"广告,广告活动;广告业"}],"pos":"n."},{"words":[{"word":"advertising","cn":"公告为…做广告advertise的ing形式"},{"word":"advertizing","cn":"登广告advertize的ing形式"}],"pos":"v."},{"words":[{"word":"advertise","cn":"做广告,登广告;作宣传"},{"word":"advert","cn":"注意;谈到"},{"word":"advertised","cn":"登广告advertise的过去式和过去分词"},{"word":"advertize","cn":"做广告"}],"pos":"vi."},{"words":[{"word":"advertise","cn":"通知;为…做广告;使突出"},{"word":"advertize","cn":"做广告,登广告;通知"}],"pos":"vt."}]},"memory":""}

View File

@@ -1,5 +0,0 @@
expect
run
happy
blue
think

View File

@@ -49,6 +49,41 @@ export const PronunciationApi = 'https://dict.youdao.com/dictvoice?audio='
export type TranslateLanguageType = 'en' | 'zh-CN' | 'ja' | 'de' | 'common' | ''
export type LanguageType = 'en' | 'ja' | 'de' | 'code'
/**
 * Shape of one dictionary entry as produced by the word crawler.
 *
 * `relWords.rels` is grouped by part of speech: each group carries its `pos`
 * label and the list of related words under it, matching the objects the
 * crawler builds (`{pos, words: [{c, cn}]}`).
 */
interface Word2 {
  word: string,
  // First and second phonetic transcription found on the page
  // (presumably UK / US respectively — TODO confirm).
  phonetic0: string,
  phonetic1: string,
  trans: {
    pos: string,
    cn: string,
  }[],
  sentences: {
    c: string,//content
    cn: string,
  }[],
  phrases: {
    c: string,
    cn: string,
  }[],
  synos: {
    pos: string,
    cn: string,
    ws: string[]
  }[],
  relWords: {
    root: string,
    rels: {
      pos: string,
      words: {
        c: string,//content
        cn: string,
      }[],
    }[]
  },
  etymology: {
    t: string,//title
    d: string,//desc
  }[],
}
export type DictResource = {
id: string
name: string