diff --git a/src/hooks/article.ts b/src/hooks/article.ts index f84fd8f2..e5543bc1 100644 --- a/src/hooks/article.ts +++ b/src/hooks/article.ts @@ -1,6 +1,5 @@ -import { Article, ArticleWord, DictId, Sentence } from "@/types/types.ts"; +import { Article, DictId, PracticeArticleWordType, Sentence } from "@/types/types.ts"; import { _nextTick, cloneDeep } from "@/utils"; -import nlp from "compromise/one"; import { usePlayWordAudio } from "@/hooks/sound.ts"; import { getSentenceAllText, getSentenceAllTranslateText } from "@/hooks/translate.ts"; import { getDefaultArticleWord } from "@/types/func.ts"; @@ -8,23 +7,6 @@ import { useSettingStore } from "@/stores/setting.ts"; import { useBaseStore } from "@/stores/base.ts"; import { useRuntimeStore } from "@/stores/runtime.ts"; -interface KeyboardMap { - Period: string, - Comma: string, - Slash: string, - Exclamation: string, - QuoteLeft: string, - QuoteRight: string, -} - -export const EnKeyboardMap: KeyboardMap = { - Period: '.', - Comma: ',', - Slash: '?', - Exclamation: '!', - QuoteLeft: `'`, - QuoteRight: `'`, -} function parseSentence(sentence: string) { // 先统一一些常见的“智能引号” -> 直引号,避免匹配问题 @@ -50,7 +32,7 @@ function parseSentence(sentence: string) { // 1) 货币 + 数字($1,000.50 或 ¥200 或 €100.5) let m = rest.match(/^[\$¥€£]\d{1,3}(?:,\d{3})*(?:\.\d+)?%?/); if (m) { - tokens.push({word: m[0], start: i, end: i + m[0].length, isSymbol: false}); + tokens.push({word: m[0], start: i, end: i + m[0].length, type: PracticeArticleWordType.Number}); i += m[0].length; continue; } @@ -58,7 +40,7 @@ function parseSentence(sentence: string) { // 2) 数字/小数/百分比(100% 3.14 1,000.00) m = rest.match(/^\d{1,3}(?:,\d{3})*(?:\.\d+)?%?/); if (m) { - tokens.push({word: m[0], start: i, end: i + m[0].length, isSymbol: false}); + tokens.push({word: m[0], start: i, end: i + m[0].length, type: PracticeArticleWordType.Number}); i += m[0].length; continue; } @@ -66,7 +48,7 @@ function parseSentence(sentence: string) { // 3) 带点缩写或多段缩写(U.S. U.S.A. e.g. i.e. Ph.D.) m = rest.match(/^[A-Za-z]+(?:\.[A-Za-z]+)+\.?/); if (m) { - tokens.push({word: m[0], start: i, end: i + m[0].length, isSymbol: false}); + tokens.push({word: m[0], start: i, end: i + m[0].length, type: PracticeArticleWordType.Word}); i += m[0].length; continue; } @@ -74,7 +56,7 @@ function parseSentence(sentence: string) { // 4) 单词(包含撇号/连字符,如 it's, o'clock, we'll, mother-in-law) m = rest.match(/^[A-Za-z0-9]+(?:[\'\-][A-Za-z0-9]+)*/); if (m) { - tokens.push({word: m[0], start: i, end: i + m[0].length, isSymbol: false}); + tokens.push({word: m[0], start: i, end: i + m[0].length, type: PracticeArticleWordType.Word}); i += m[0].length; continue; } @@ -82,13 +64,13 @@ function parseSentence(sentence: string) { // 5) 其它可视符号(标点)——单字符处理(连续标点会被循环拆为单字符) // 包括:.,!?;:"'()-[]{}<>/\\@#%^&*~`等非单词非空白字符 if (/[^\w\s]/.test(ch)) { - tokens.push({word: ch, start: i, end: i + 1, isSymbol: true}); + tokens.push({word: ch, start: i, end: i + 1, type: PracticeArticleWordType.Symbol}); i += 1; continue; } // 6) 回退方案:把当前字符当作一个 token(防止意外丢失) - tokens.push({word: ch, start: i, end: i + 1, isSymbol: /[^\w\s]/.test(ch)}); + tokens.push({word: ch, start: i, end: i + 1, type: PracticeArticleWordType.Symbol}); i += 1; } @@ -97,7 +79,7 @@ function parseSentence(sentence: string) { const next = tokens[idx + 1]; const between = next ? sentence.slice(t.end, next.start) : sentence.slice(t.end); const nextSpace = /\s/.test(between); - return getDefaultArticleWord({word: t.word, nextSpace, isSymbol: !!t.isSymbol}); + return getDefaultArticleWord({word: t.word, nextSpace, type: t.type}); }); return result; @@ -175,235 +157,140 @@ export function genArticleSectionData(article: Article): number { } export function splitEnArticle2(text: string): string { - if (!text) { - text = `Last week I went to the theatre. I had a very good seat. The play was very interesting. I did not enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I could not hear the actors. I turned round. I looked at the man and the woman angrily. They did not pay any attention. In the end, I could not bear it. I turned round again. I cant hear a word! I said angrily. -Its none of your business, the young man said rudely. This is a private conversation!` - // text = `While it is yet to be seen what direction the second Trump administration will take globally in its China policy, VOA traveled to the main island of Mahe in Seychelles to look at how China and the U.S. have impacted the country, and how each is fairing in that competition for influence there.` + text = text.trim() + if (!text && false) { +// text = `It was Sunday. I never get up early on Sundays. I sometimes stay in bed until lunchtime. Last Sunday I got up very late. I looked out of the window. It was dark outside. 'What a day!' I thought. 'It's raining again. ' Just then, the telephone rang. It was my aunt Lucy. 'I've just arrived by train, ' she said. 'I'm coming to see you. ' +// +// 'But I'm still having breakfast, ' I said. +// 'What are you doing?' she asked. +// 'I'm having breakfast, ' I repeated. +// 'Dear me,$3.000' she said. 'Do you always get up so late? It's one o'clock!'` +// text = `While it is yet to be seen what direction the second Trump administration will take globally in its China policy, VOA traveled to the main island of Mahe in Seychelles to look at how China and the U.S. have impacted the country, and how each is fairing in that competition for influence there.` // text = "It was Sunday. I never get up early on Sundays. I sometimes stay in bed until lunchtime. Last Sunday I got up very late. I looked out of the window. It was dark outside. 'What a day!' I thought. 'It's raining again.' Just then, the telephone rang. It was my aunt Lucy. 'I've just arrived by train,' she said. 'I'm coming to see you.'\n\n 'But I'm still having breakfast,' I said.\n\n 'What are you doing?' she asked.\n\n 'I'm having breakfast,' I repeated.\n\n 'Dear me,' she said. 'Do you always get up so late? It's one o'clock!'" } - //将中文符号替换 - text = text.replaceAll('’', "'") - text = text.replaceAll('—', "-") - text = text.replaceAll('”', '"') - text = text.replaceAll('“', '"') - // console.time() - let keyboardMap = EnKeyboardMap - let sections: Sentence[][] = [] - let sectionTextList = text.replaceAll('\n\n', '`^`').replaceAll('\n', '').split('`^`') - // console.log(sectionTextList); - sectionTextList.filter(v => v).map((sectionText, i) => { - let section: Sentence[] = [] - sections.push(section) - sectionText = sectionText.trim() + if (!text) return ''; - let doc = nlp(sectionText) - let sentenceNlpList = [] - doc.json().map(item => { + const abbreviations = [ + 'Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', + 'St', 'Co', 'Ltd', 'Inc', 'e.g', 'i.e', 'U.S.A', 'U.S', 'U.K', 'etc' + ]; - //如果整句大于15个单词以上,检测是否有 逗号子句 - if (item.terms.length > 15) { - //正则匹配“逗号加and|but|so|because" - let list = item.text.split(/,\s(?=(and|but|so|because)\b)/).filter(_ => { - //匹配完之后会把and|but|so|because也提出来,这里不需要重复的,直接筛选掉 - if (_ && !['and', 'but', 'so', 'because'].includes(_)) return _ - }) - if (list.length === 1) { - sentenceNlpList.push(item) + function isSentenceEnd(text, idx) { + const before = text.slice(0, idx + 1); + const after = text.slice(idx + 1); + + const abbrevPattern = new RegExp('\\b(' + abbreviations.join('|') + ')\\.$', 'i'); + if (abbrevPattern.test(before)) return false; + if (/\d+\.$/.test(before)) return false; + if (/\d+\.\d/.test(text.slice(idx - 1, idx + 2))) return false; + if (/%/.test(after)) return false; + if (/[\$¥€]\d/.test(before + after)) return false; + + return true; + } + + function normalizeQuotes(text) { + const isWord = ch => /\w/.test(ch); + let res = []; + let singleOpen = false; + let doubleOpen = false; + for (let i = 0; i < text.length; i++) { + const ch = text[i]; + if (ch === "'") { + const prev = i > 0 ? text[i - 1] : ''; + const nxt = i + 1 < text.length ? text[i + 1] : ''; + if (isWord(prev) && isWord(nxt)) { + res.push("'"); + continue; + } + if (singleOpen) { + if (res.length && res[res.length - 1] === ' ') res.pop(); + res.push("'"); + singleOpen = false; } else { - list.map((text, i) => { - //分割后每句都没有逗号了,所以除了最后一句外需要加回来 - sentenceNlpList = sentenceNlpList.concat(nlp(text + (i !== list.length - 1 ? ',' : '')).json()) - }) + res.push("'"); + singleOpen = true; + } + } else if (ch === '"') { + if (doubleOpen) { + if (res.length && res[res.length - 1] === ' ') res.pop(); + res.push('"'); + doubleOpen = false; + } else { + res.push('"'); + doubleOpen = true; } } else { - sentenceNlpList.push(item) + res.push(ch); } - }) + } + return res.join(''); + } - sentenceNlpList.map(item => { - let sentence: Sentence = cloneDeep({ - //他没有空格,导致修改一行一行的数据时,汇总时全没有空格了,库无法正常断句 - text: item.text + ' ', - // text: '', - translate: '', - words: [], - audioPosition: [0, 0], - }) - section.push(sentence) + let rawParagraphs = text.replaceAll('\n\n', '`^`').replaceAll('\n', '').split('`^`') - const checkQuote = (pre: string, index?: number) => { - let nearSymbolPosition = null - if (index === 0) { - nearSymbolPosition = 'end' - } else { - //TODO 可以优化成for+break - section.slice().reverse().map((sentenceItem, b) => { - sentenceItem.words.slice().reverse().map((wordItem, c) => { - if (wordItem.symbolPosition !== '' && nearSymbolPosition === null) { - nearSymbolPosition = wordItem.symbolPosition - } - }) - }) - } + const formattedParagraphs = rawParagraphs.map(p => { + p = p.trim(); + if (!p) return ''; - let word3: ArticleWord = getDefaultArticleWord({ - word: pre, - nextSpace: false, - isSymbol: true, - symbolPosition: '' - }); - // console.log('rrr', item) - // console.log('nearSymbolPosition', nearSymbolPosition) - if (nearSymbolPosition === 'end' || nearSymbolPosition === null) { - word3.symbolPosition = 'start' - sentence.words.push(word3) - } else { - sentence.words[sentence.words.length - 1].nextSpace = false - word3.symbolPosition = 'end' - word3.nextSpace = true + p = p.replace(/\n/g, ' '); + p = normalizeQuotes(p); - let addCurrent = false - sentence.words.slice().reverse().map((wordItem, c) => { - if (wordItem.symbolPosition === 'start' && !addCurrent) { - addCurrent = true - } - }) - if (addCurrent) { - sentence.words.push(word3) + const tentative: string[] = p.match(/[^.!?。!?]+[.!?。!?'"”’)]*/g) || []; + + const sentences = []; + tentative.forEach(segment => { + segment = segment.trim(); + if (!segment) return; + + const lastCharIdx = segment.length - 1; + if (/[.!?。!?]/.test(segment[lastCharIdx])) { + const globalIdx = p.indexOf(segment); + if (!isSentenceEnd(p, globalIdx + segment.length - 1)) { + if (sentences.length > 0) { + sentences[sentences.length - 1] += ' ' + segment; } else { - // 'Do you always get up so late? It'LICENSE one o'clock!' 会被断成两句 - let lastSentence = section[section.length - 2] - lastSentence.words = lastSentence.words.concat(sentence.words) - lastSentence.words.push(word3) - sentence.words = [] - //这里还不能直接删除sentence,因为后面还有一个 sentence.words = sentence.words.filter(v => v.word !== 'placeholder') 的判断 - // section.pop() + sentences.push(segment); } + return; } } + sentences.push(segment); + }); - const checkSymbol = (post: string, nextSpace: boolean = true) => { - switch (post) { - case keyboardMap.Period: - case keyboardMap.Comma: - case keyboardMap.Slash: - case keyboardMap.Exclamation: - sentence.words[sentence.words.length - 1].nextSpace = false - let word2 = getDefaultArticleWord({ - word: post, - isSymbol: true, - nextSpace - }); - sentence.words.push(word2) - break - case keyboardMap.QuoteLeft: - case ')': - checkQuote(post) - break - case `.'`: - case `!'`: - case `?'`: - case `,'`: - case `*'`: - post.split('').map(v => { - checkSymbol(v, false) - }) - break - //类似于这种的“' -- ”的。需要保留空格,用了一个占位符才处理,因为每个符号都会把前面的那个字符的nextSpace改为false - case ' ': - // console.log('sentence', sentence) - //遇到“The clock has stopped!' I looked at my watch.” - //检测到stopped!' 的'时,如果前引号不在当前句,会把当前句的word合并到前一句。那么当前句的word就为空了,会报错 - //所以需要检测一下 - if (sentence.words.length) { - sentence.words[sentence.words.length - 1].nextSpace = true - let word3 = getDefaultArticleWord({ - word: 'placeholder', - isSymbol: true, - nextSpace: false, - }); - sentence.words.push(word3) - } - break - default: - // console.log('post', post) - //这里多半是一些奇怪的连接符之类的 - if (post.length > 1) { - post.split('').map(v => { - checkSymbol(v, false) - }) - } else { - sentence.words[sentence.words.length - 1].nextSpace = false - let word3 = getDefaultArticleWord({ - word: post, - isSymbol: true, - nextSpace: false, - }); - sentence.words.push(word3) - } - break + const finalSentences = []; + let i = 0; + while (i < sentences.length) { + let cur = sentences[i]; + if (i + 1 < sentences.length) { + const nxt = sentences[i + 1]; + if (/['"”’)\]]$/.test(cur) && /^[a-z]|^(I|You|She|He|They|We)\b/i.test(nxt)) { + finalSentences.push(cur + ' ' + nxt); + i += 2; + continue; } } + finalSentences.push(cur); + i += 1; + } - item.terms.map((v, index: number) => { - // console.log('v', v) - if (v.text) { - let pre: string = v.pre.trim() - if (pre) { - checkQuote(pre, index) - } + return finalSentences.join('\n'); + }); - let word = getDefaultArticleWord({word: v.text, nextSpace: true}); - sentence.words.push(word) - - let post: string = v.post - //判断是不是等于空,因为正常的词后面都会有个空格。这种不需要处理。 - if (post && post !== ' ') { - checkSymbol(post.trim()) - } - } - }) - - //去除空格占位符 - sentence.words = sentence.words.filter(v => v.word !== 'placeholder') - //如果是空的,直接去掉 - if (!sentence.words.length) { - section.pop() - } - }) - // console.log(sentenceNlpList) - }) - - sections = sections.filter(sectionItem => sectionItem.length) - sections.map((sectionItem, a) => { - sectionItem.map((sentenceItem, b) => { - sentenceItem.text = sentenceItem.words.reduce((previousValue: string, currentValue) => { - previousValue += currentValue.word + (currentValue.nextSpace ? ' ' : '') - return previousValue - }, '') - }) - }) - - // console.log(sections) - - //这里在每一行结尾处,加一个空格,因为. 号后面必要要有空格才能被库正常短句 - text = sections.map(v => v.map(s => s.text.trim()).join(' \n')).join(' \n\n'); - // console.log('s',text) - // return text - return text + return formattedParagraphs.filter(p => p).join('\n\n'); } export function splitCNArticle2(text: string): string { - if (!text) { + if (!text && false) { // text = "飞机误点了,侦探们在机场等了整整一上午。他们正期待从南非来的一个装着钻石的贵重包裹。数小时以前,有人向警方报告,说有人企图偷走这些钻石。当飞机到达时,一些侦探等候在主楼内,另一些侦探则守候在停机坪上。有两个人把包裹拿下飞机,进了海关。这时两个侦探把住门口,另外两个侦探打开了包裹。令他们吃惊的是,那珍贵的包裹里面装的全是石头和沙子!" - text = `那是个星期天,而在星期天我是从来不早起的,有时我要一直躺到吃午饭的时候。上个星期天,我起得很晚。我望望窗外,外面一片昏暗。“鬼天气!”我想,“又下雨了。”正在这时,电话铃响了。是我姑母露西打来的。“我刚下火车,”她说,“我这就来看你。” -“但我还在吃早饭,”我说。 -“你在干什么?”她问道。 -“我正在吃早饭,”我又说了一遍。 -“天啊,”她说,“你总是起得这么晚吗?现在已经1点钟了!”` - text = `上星期我去看戏。我的座位很好,戏很有意思,但我却无法欣赏。一青年男子与一青年女子坐在我的身后,大声地说着话。我非常生气,因为我听不见演员在说什么。我回过头去怒视着那一男一女,他们却毫不理会。最后,我忍不住了,又一次回过头去,生气地说:“我一个字也听不见了!” -“不关你的事,”那男的毫不客气地说,“这是私人间的谈话!”` +// text = `那是个星期天,而在星期天我是从来不早起的,有时我要一直躺到吃午饭的时候。上个星期天,我起得很晚。我望望窗外,外面一片昏暗。“鬼天气!”我想,“又下雨了。”正在这时,电话铃响了。是我姑母露西打来的。“我刚下火车,”她说,“我这就来看你。” +// “但我还在吃早饭,”我说。 +// “你在干什么?”她问道。 +// “我正在吃早饭,”我又说了一遍。 +// “天啊,”她说,“你总是起得这么晚吗?现在已经1点钟了!”` +// text = `上星期我去看戏。我的座位很好,戏很有意思,但我却无法欣赏。一青年男子与一青年女子坐在我的身后,大声地说着话。我非常生气,因为我听不见演员在说什么。我回过头去怒视着那一男一女,他们却毫不理会。最后,我忍不住了,又一次回过头去,生气地说:“我一个字也听不见了!” +// “不关你的事,”那男的毫不客气地说,“这是私人间的谈话!”` } const segmenterJa = new Intl.Segmenter("zh-CN", {granularity: "sentence"}); diff --git a/src/pages/article/PracticeArticles.vue b/src/pages/article/PracticeArticles.vue index 2beccd58..23e1bf95 100644 --- a/src/pages/article/PracticeArticles.vue +++ b/src/pages/article/PracticeArticles.vue @@ -4,14 +4,24 @@ import { computed, onMounted, onUnmounted, provide, watch } from "vue"; import { useBaseStore } from "@/stores/base.ts"; import { emitter, EventKey, useEvents } from "@/utils/eventBus.ts"; import { useSettingStore } from "@/stores/setting.ts"; -import { Article, ArticleItem, ArticleWord, Dict, DictType, ShortcutKey, Statistics, Word } from "@/types/types.ts"; +import { + Article, + ArticleItem, + ArticleWord, + Dict, + DictType, + PracticeArticleWordType, + ShortcutKey, + Statistics, + Word +} from "@/types/types.ts"; import { useDisableEventListener, useOnKeyboardEventListener, useStartKeyboardEventListener } from "@/hooks/event.ts"; import useTheme from "@/hooks/theme.ts"; import Toast from '@/components/base/toast/Toast.ts' -import { _getDictDataByUrl, _nextTick, cloneDeep, msToHourMinute, msToMinute, total } from "@/utils"; +import { _getDictDataByUrl, _nextTick, cloneDeep, msToMinute, total } from "@/utils"; import { usePracticeStore } from "@/stores/practice.ts"; import { useArticleOptions } from "@/hooks/dict.ts"; -import { genArticleSectionData, syncBookInMyStudyList, usePlaySentenceAudio } from "@/hooks/article.ts"; +import { genArticleSectionData, usePlaySentenceAudio } from "@/hooks/article.ts"; import { getDefaultArticle, getDefaultDict, getDefaultWord } from "@/types/func.ts"; import TypingArticle from "@/pages/article/components/TypingArticle.vue"; import BaseIcon from "@/components/BaseIcon.vue"; @@ -24,9 +34,7 @@ import { useRoute, useRouter } from "vue-router"; import book_list from "@/assets/book-list.json"; import PracticeLayout from "@/components/PracticeLayout.vue"; import Switch from "@/components/base/Switch.vue"; -import Audio from "@/components/base/Audio.vue"; import ArticleAudio from "@/pages/article/components/ArticleAudio.vue"; -import dayjs from "dayjs"; import { PracticeSaveArticleKey } from "@/utils/const.ts"; const store = useBaseStore() @@ -209,7 +217,7 @@ function setArticle(val: Article) { articleData.article.sections.map((v, i) => { v.map((w) => { w.words.map(s => { - if (!ignoreList.includes(s.word.toLowerCase()) && !s.isSymbol) { + if (!ignoreList.includes(s.word.toLowerCase()) && s.type === PracticeArticleWordType.Word) { statStore.total++ } }) @@ -307,7 +315,7 @@ function wrong(word: Word) { } function nextWord(word: ArticleWord) { - if (!store.allIgnoreWords.includes(word.word.toLowerCase()) && !word.isSymbol) { + if (!store.allIgnoreWords.includes(word.word.toLowerCase()) && word.type === PracticeArticleWordType.Word) { statStore.inputWordNumber++ } } diff --git a/src/pages/article/components/EditArticle.vue b/src/pages/article/components/EditArticle.vue index fd776f03..e6327efb 100644 --- a/src/pages/article/components/EditArticle.vue +++ b/src/pages/article/components/EditArticle.vue @@ -79,7 +79,7 @@ function apply(isHandle: boolean = true) { //分句原文 function splitText() { - editArticle.text = splitEnArticle2(editArticle.text.trim()) + editArticle.text = splitEnArticle2(editArticle.text) } //分句翻译 diff --git a/src/pages/article/components/TypingArticle.vue b/src/pages/article/components/TypingArticle.vue index 5bbcc623..2252e8fb 100644 --- a/src/pages/article/components/TypingArticle.vue +++ b/src/pages/article/components/TypingArticle.vue @@ -1,6 +1,6 @@