// feat: 重构词典 — refactor the dictionary build pipeline.
// This commit adds three new scripts (the web diff viewer suppressed the
// large data-file diffs).
//
// File: js_node/test2.js (new file, 109 lines)
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import {chromium} from 'playwright';
|
||||
import pLimit from 'p-limit';
|
||||
import {fileURLToPath} from 'url';
|
||||
import dayjs from 'dayjs';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
// 路径设置
|
||||
const SOURCE_DIR = path.join(__dirname, 'source');
|
||||
const RESULT_DIR = path.join(__dirname, 'result');
|
||||
const TOTAL_RESULT_FILE = path.join(__dirname, 'save', 'all.json');
|
||||
const TOTAL_RESULT_FILE2 = path.join(__dirname, 'save', 'all2.json');
|
||||
const FAILED_FILE = path.join(__dirname, 'save', 'failed.json');
|
||||
|
||||
// 控制参数
|
||||
const CONCURRENCY = 6;
|
||||
|
||||
let failList = []
|
||||
|
||||
// 创建结果目录
|
||||
if (!fs.existsSync(RESULT_DIR)) {
|
||||
fs.mkdirSync(RESULT_DIR);
|
||||
}
|
||||
|
||||
const existingMap = new Map();
|
||||
// 加载已爬数据(增量去重)
|
||||
if (fs.existsSync(TOTAL_RESULT_FILE)) {
|
||||
const lines = fs.readFileSync(TOTAL_RESULT_FILE, 'utf-8').split('\n').filter(Boolean);
|
||||
console.log(lines.length)
|
||||
for (const line of lines) {
|
||||
try {
|
||||
const obj = JSON.parse(line);
|
||||
if (obj?.word) {
|
||||
existingMap.set(obj.word.toLowerCase(), {...obj, id: existingMap.size});
|
||||
}
|
||||
} catch {
|
||||
}
|
||||
}
|
||||
console.log(`📦 已加载 ${existingMap.size} 个已爬词`);
|
||||
}
|
||||
|
||||
const failStr = fs.readFileSync(FAILED_FILE, 'utf-8')
|
||||
|
||||
if (failStr) {
|
||||
failList = JSON.parse(failStr)
|
||||
}
|
||||
// Main: for every source JSON, map each entry to its already-crawled record
// (or a freshly initialised placeholder), write one *_v2.json per source
// file, then dump the aggregate map to all2.json.
(async () => {
  const files = fs.readdirSync(SOURCE_DIR).filter(f => f.endsWith('.json'));

  for (const file of files) {
    const filePath = path.join(SOURCE_DIR, file);
    const raw = JSON.parse(fs.readFileSync(filePath, 'utf-8'));

    let result = [];
    raw.forEach(obj => {
      try {
        let key = String(obj.name).toLowerCase();
        let r = existingMap.get(key);
        if (r) {
          result.push({...r, word: String(r.word)});
        } else {
          try {
            // console.log(`不存在:`, key)
            let d = {
              id: existingMap.size,
              "word": String(obj.name),
              "phonetic0": "",
              "phonetic1": "",
              "trans": [],
              "sentences": [],
              "phrases": [],
              "synos": [],
              "relWords": {"root": "", "rels": []},
              "etymology": [],
            };
            if (Array.isArray(obj.trans)) {
              d.trans = obj?.trans?.map((a) => ({pos: '', cn: a})) || [];
            } else {
              // FIX: was `cn: d.trans` — d.trans had just been initialised
              // to [], so the scalar translation was silently dropped. The
              // source entry's translation lives on obj.trans.
              d.trans = [{pos: '', cn: obj.trans}];
            }
            existingMap.set(key, d);
            result.push(d);
          } catch (e) {
            console.log('filePath:' + filePath, 'word:' + obj.name)
            console.error(e);
          }
        }
      } catch (e) {
        console.log('--------filePath:' + filePath, 'word:' + JSON.stringify(obj));
        console.error(e);
      }
    })

    const outputName = path.basename(file, '.json') + '_v2.json';
    const outputPath = path.join(RESULT_DIR, outputName);

    fs.writeFileSync(outputPath, JSON.stringify(result, null, 2), 'utf-8');
    // console.log(`✅ 已保存:${outputName}`);
  }

  console.log(`最终${existingMap.size}个单词`);
  // NOTE(review): Array.from(existingMap) serialises [key, value] PAIRS;
  // if plain records are wanted this should be existingMap.values().
  // Kept as-is since consumers of all2.json may rely on the pair format —
  // TODO confirm.
  fs.writeFileSync(TOTAL_RESULT_FILE2, JSON.stringify(Array.from(existingMap), null, 2), 'utf-8');

  console.log('\n🎉 所有任务完成!');
})();
// ============================================================
// File: js_node/汇总.js (new file, 104 lines)
// ============================================================
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import {chromium} from 'playwright';
|
||||
import pLimit from 'p-limit';
|
||||
import {fileURLToPath} from 'url';
|
||||
import dayjs from 'dayjs';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
// 路径设置
|
||||
const SOURCE_DIR = path.join(__dirname, 'source');
|
||||
const SAVE_DIR = path.join(__dirname, 'save');
|
||||
const TOTAL_RESULT_FILE = path.join(__dirname, 'save', 'all.json');
|
||||
|
||||
|
||||
const existingMap = new Map();
|
||||
// 加载已爬数据(增量去重)
|
||||
if (fs.existsSync(TOTAL_RESULT_FILE)) {
|
||||
const lines = fs.readFileSync(TOTAL_RESULT_FILE, 'utf-8').split('\n').filter(Boolean);
|
||||
console.log(lines.length)
|
||||
for (const line of lines) {
|
||||
try {
|
||||
const obj = JSON.parse(line);
|
||||
if (obj?.word) {
|
||||
existingMap.set(obj.word, obj);
|
||||
}
|
||||
} catch {
|
||||
}
|
||||
}
|
||||
console.log(`📦 已加载 ${existingMap.size} 个已爬词`);
|
||||
}
|
||||
|
||||
|
||||
let normalList = new Map();
|
||||
let unnormalList = new Map();
|
||||
|
||||
|
||||
// Coerce to a trimmed string; any non-string value becomes ''.
const safeString = (str) => (typeof str === 'string' ? str.trim() : '');

// Trim, split on `sep` and drop empty segments; non-strings / blanks -> [].
const safeSplit = (str, sep) => {
  const trimmed = safeString(str);
  if (!trimmed) {
    return [];
  }
  return trimmed.split(sep).filter(Boolean);
};

/**
 * Parse a multi-line translation string into [{pos, cn}] records.
 * Each line is expected to look like "n. 释义". When the leading token is
 * not a plain Latin part-of-speech tag (e.g. "【名】"), the entire line is
 * kept as `cn` and `pos` is left empty.
 */
function getTrans(trans) {
  const entries = [];
  for (const line of safeSplit(trans, '\n')) {
    const match = line.match(/^([^\s.]+\.?)\s*(.*)$/);
    if (!match) {
      entries.push({pos: '', cn: safeString(line)});
      continue;
    }
    const tag = safeString(match[1]);
    const isPosTag = /^[a-zA-Z]+\.?$/.test(tag);
    entries.push(
      isPosTag
        ? {pos: tag, cn: safeString(match[2])}
        : {pos: '', cn: safeString(line)}
    );
  }
  return entries;
}
// Main: normalise every source entry and split it into "normal" single-token
// words vs "unnormal" entries (phrases, slash forms, parenthesised forms),
// then persist both buckets under save/.
(async () => {
  const files = fs.readdirSync(SOURCE_DIR).filter(f => f.endsWith('.json'));

  for (const file of files) {
    const filePath = path.join(SOURCE_DIR, file);
    const raw = JSON.parse(fs.readFileSync(filePath, 'utf-8'));

    // FIX(idiom): `.map()` was used purely for side effects with the result
    // discarded; for...of states the intent (we only mutate the two Maps).
    const entries = raw.filter(v => v && v.name && String(v.name).trim());
    for (const v of entries) {
      let word = String(v.name).trim();
      // Drop a single trailing period (e.g. "etc." -> "etc").
      if (word.endsWith('.')) {
        word = word.substring(0, word.length - 1);
      }
      // Words already present in the aggregate are skipped entirely.
      let r = existingMap.get(word);
      if (!r) {
        r = {
          "word": String(word),
          // Strip the square brackets the source wraps phonetics in.
          "phonetic0": v?.ukphone?.replaceAll('[', '')?.replaceAll(']', '') || '',
          "phonetic1": v?.usphone?.replaceAll('[', '')?.replaceAll(']', '') || '',
          "trans": [],
          "sentences": [],
          "phrases": [],
          "synos": [],
          "relWords": {"root": "", "rels": []},
          "etymology": [],
        };
        if (Array.isArray(v.trans)) {
          // Keep at most 3 short senses, joined so getTrans can re-split.
          r.trans = getTrans(v.trans.filter(a => a && a.length < 150).slice(0, 3).join('\n'));
        } else {
          r.trans = v.trans ? getTrans(v.trans) : [];
        }
        // Entries containing separators or (full-width) parentheses are
        // not plain dictionary words.
        if (word.includes('/') || word.includes(' ') || word.includes('(') || word.includes(')') || word.includes('(') || word.includes(')')) {
          unnormalList.set(word, r);
        } else {
          normalList.set(word, r);
        }
      }
    }
  }
  console.log(normalList.size, unnormalList.size)
  fs.writeFileSync(path.join(SAVE_DIR, 'normalList.json'), JSON.stringify(Array.from(normalList.values()), null, 2), 'utf-8');
  fs.writeFileSync(path.join(SAVE_DIR, 'unnormalList.json'), JSON.stringify(Array.from(unnormalList.values()), null, 2), 'utf-8');
})();
// ============================================================
// File: js_node/爬虫.js (new file, 198 lines)
// ============================================================
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import {chromium} from 'playwright';
|
||||
import pLimit from 'p-limit';
|
||||
import {fileURLToPath} from 'url';
|
||||
import dayjs from 'dayjs';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
// 路径设置
|
||||
const SOURCE_DIR = path.join(__dirname, 'source');
|
||||
const RESULT_DIR = path.join(__dirname, 'result');
|
||||
const TOTAL_RESULT_FILE = path.join(__dirname, 'save', 'all.json');
|
||||
const normalList_FILE = path.join(__dirname, 'save', 'normalList.json');
|
||||
const unnormalList_FILE = path.join(__dirname, 'save', 'unnormalList.json');
|
||||
const FAILED_FILE = path.join(__dirname, 'save', 'failed.json');
|
||||
|
||||
// 控制参数
|
||||
const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
|
||||
const MAX_COUNT = 999999999999;
|
||||
|
||||
let failList = []
|
||||
let crawlCount = 0;
|
||||
const existingMap = new Map();
|
||||
|
||||
// 创建结果目录
|
||||
if (!fs.existsSync(RESULT_DIR)) {
|
||||
fs.mkdirSync(RESULT_DIR);
|
||||
}
|
||||
|
||||
// 加载已爬数据(增量去重)
|
||||
if (fs.existsSync(TOTAL_RESULT_FILE)) {
|
||||
const lines = fs.readFileSync(TOTAL_RESULT_FILE, 'utf-8').split('\n').filter(Boolean);
|
||||
for (const line of lines) {
|
||||
try {
|
||||
const obj = JSON.parse(line);
|
||||
if (obj?.word) {
|
||||
existingMap.set(obj.word.toLowerCase(), obj);
|
||||
}
|
||||
} catch {
|
||||
}
|
||||
}
|
||||
console.log(`📦 已加载 ${existingMap.size} 个已爬词`);
|
||||
}
|
||||
|
||||
const failStr = fs.readFileSync(FAILED_FILE, 'utf-8')
|
||||
|
||||
if (failStr) {
|
||||
failList = JSON.parse(failStr)
|
||||
}
|
||||
|
||||
// Record a failed word (deduplicated by its `word` field) and persist the
// whole fail list to disk immediately so progress survives a crash.
function addToFail(val) {
  const alreadyTracked = failList.some(v => v.word === val.word);
  if (alreadyTracked) {
    return;
  }
  failList.push(val);
  fs.writeFileSync(FAILED_FILE, JSON.stringify(failList, null, 2), 'utf-8');
}
// 爬虫主函数 — scrape one word's youdao.com entry into `val` (mutated in
// place and returned). Retries up to 2 extra times on failure; on final
// failure the word is recorded via addToFail.
//   val      record with at least {word}; trans/sentences/phrases/synos/
//            relWords/etymology are filled in
//   page     a Playwright Page
//   retry    current retry depth (internal)
//   failName source file name, forwarded on retry (currently unused here)
async function crawlWord(val, page, retry = 0, failName) {
  let word = val.word;
  const data = val;
  const url = `https://www.youdao.com/result?word=${encodeURIComponent(word)}&lang=en`;

  try {
    await page.goto(url, {waitUntil: 'networkidle', timeout: 15000});

    // Headword exactly as displayed (first text node only, excludes child tags).
    const titleEl = await page.locator('.title').first();
    data.word = await titleEl.evaluate(el => el.firstChild?.nodeValue || '');

    // UK / US phonetics; the page renders them wrapped in slashes.
    const phones = await page.$$('.per-phone .phonetic');
    if (phones[0]) data.phonetic0 = (await phones[0].textContent())?.trim() || '';
    if (phones[1]) data.phonetic1 = (await phones[1].textContent())?.trim() || '';
    data.phonetic0 = data.phonetic0.replaceAll('/', '').trim();
    data.phonetic1 = data.phonetic1.replaceAll('/', '').trim();

    // Basic senses: part of speech + Chinese translation.
    for (const el of await page.$$('.basic .word-exp')) {
      const pos = await el.$('.pos');
      const tran = await el.$('.trans');
      data.trans.push({
        pos: pos ? (await pos.textContent())?.trim() : '',
        cn: tran ? (await tran.textContent())?.trim() : '',
      });
    }

    // Bilingual example sentences.
    for (const el of await page.$$('.blng_sents_part .trans-container ul li .col2')) {
      const en = await el.$('.sen-eng');
      const ch = await el.$('.sen-ch');
      data.sentences.push({
        c: en ? (await en.textContent())?.trim() : '',
        cn: ch ? (await ch.textContent())?.trim() : '',
      });
    }

    // Common phrases.
    for (const el of await page.$$('.phrs ul li .phrs-content')) {
      const point = await el.$('.point');
      const tran = await el.$('.phr_trans');
      data.phrases.push({
        c: point ? (await point.textContent())?.trim() : '',
        cn: tran ? (await tran.textContent())?.trim() : '',
      });
    }

    // 同近义词 (synonyms) tab — optional; errors are swallowed when absent.
    try {
      // FIX: the 2s timeout was passed to getByText (which ignores it);
      // it belongs on click(), otherwise the 30s default applies.
      await page.getByText('同近义词').click({timeout: 2000});
      await page.waitForSelector('.syno', {timeout: 3000});
      for (const el of await page.$$('.syno-item')) {
        const pos = await el.$('.index');
        const tran = await el.$('.synptran');
        const wordEl = await el.$('.clickable');
        let str = wordEl ? (await wordEl.textContent())?.trim() : '';
        data.synos.push({
          pos: pos ? (await pos.textContent())?.trim() : '',
          cn: tran ? (await tran.textContent())?.trim() : '',
          ws: str.split('/').map(s => s.trim()).filter(Boolean),
        });
      }
    } catch {
      // Tab not present for this word — skip.
    }

    // 同根词 (words sharing a root) tab — optional.
    try {
      await page.getByText('同根词').click({timeout: 2000});
      await page.waitForSelector('.rel_word', {timeout: 3000});
      const cigen = await page.$('.trans-container > p .point');
      data.relWords.root = cigen ? (await cigen.textContent())?.trim() : '';
      for (const el of await page.$$('.rel_word_item')) {
        let item = {pos: '', words: []};
        const pos = await el.$('.pos');
        item.pos = pos ? (await pos.textContent())?.trim() : '';
        for (const el2 of await el.$$('.rel_content p')) {
          const word = await el2.$('.point');
          let wordStr = word ? (await word.textContent())?.trim() : '';
          let str = el2 ? (await el2.textContent())?.trim() : '';
          // The <p> text is "word + gloss"; strip the word, keep the gloss.
          str = str.replace(wordStr, '');
          item.words.push({c: wordStr, cn: str});
        }
        data.relWords.rels.push(item);
      }
    } catch {
      // Tab not present for this word — skip.
    }

    // 词源 (etymology) tab — optional.
    try {
      await page.getByText('词源').click({timeout: 2000});
      await page.waitForSelector('.etymology', {timeout: 3000});
      for (const el of await page.$$('.trans-cell')) {
        const header = await el.$('.header');
        const zh_result = await el.$('.zh_result');
        data.etymology.push({
          t: header ? (await header.textContent())?.trim() : '',
          d: zh_result ? (await zh_result.textContent())?.trim() : '',
        });
      }
    } catch {
      // Tab not present for this word — skip.
    }

    return data;
  } catch (err) {
    // FIX: an unconditional `return data;` used to sit here, making the
    // retry / fail-list handling below unreachable dead code.
    if (retry < 2) {
      console.log(`🔁 ${word} 抓取失败,重试中...`);
      await sleep(1000);
      return crawlWord(val, page, retry + 1, failName);
    } else {
      console.log(`❌ ${word} 抓取失败`);
      addToFail(val);
      return data;
    }
  }
}
// Main: crawl first the "unnormal" then the "normal" word list, persisting
// incremental progress to <list>-fetch.json after every word.
(async () => {
  const browser = await chromium.launch({headless: true});
  // FIX: Browser.newPage() returns a Promise — without `await`, `page` was
  // a pending Promise and every page.goto() inside crawlWord would throw.
  const page = await browser.newPage();

  async function start(file) {
    const raw = JSON.parse(fs.readFileSync(file, 'utf-8'));
    const resultMap = new Map();

    // FIX: the loop was bounded by MAX_COUNT (~1e12) instead of the input
    // length, so raw[i] became undefined and `word.word` crashed right
    // after the last real item.
    for (let i = 0; i < Math.min(raw.length, MAX_COUNT); i++) {
      let word = raw[i];
      console.log(`爬取:${file},${word.word},进度:${resultMap.size} / ${raw.length};时间:${dayjs().format('YYYY-MM-DD HH:mm:ss')}`)
      const result = await crawlWord(word, page, 0, file);
      if (result) {
        resultMap.set(word.word, result);
        // Rewrite the whole progress file each time so a crash loses nothing.
        fs.writeFileSync(file.replaceAll('.json', '-fetch.json'), JSON.stringify(Array.from(resultMap.values()), null, 2), 'utf-8');
      }
      // Throttle politely between requests.
      await sleep(2300);
    }
  }

  await start(unnormalList_FILE)
  await start(normalList_FILE)
  await browser.close();

  console.log('\n🎉 所有任务完成!');
})();
// (end of commit diff — "Reference in New Issue" / "Block a user" were
// web-UI chrome, not file content)