update article.ts

This commit is contained in:
zyronon
2023-09-27 18:48:48 +08:00
parent 0bf4cfe61b
commit 5bbc068a1f
9 changed files with 1017 additions and 1644 deletions

13
Note.md
View File

@@ -5,4 +5,15 @@ sentence-splitter
无法正常断句(在引号里面的问号和感叹号会被断句),但可以识别例句
https://github.com/Tessmore/sbd
一切正常
无法识别Mr.James Scott has a garage in Silbury and now he has just bought another garage in Pinhurst.
会将Mr.James 断句
wink-nlp
'What a day!' I thought.
'What are you doing?' she asked.
会被断句
compromise
表现良好,另外自带分词功能
以上所有库都会将:'Do you always get up so late? It's one o'clock!'分成两句....

4
components.d.ts vendored
View File

@@ -21,15 +21,11 @@ declare module 'vue' {
DictModal: typeof import('./src/components/Toolbar/DictModal.vue')['default']
EditAbleText: typeof import('./src/components/EditAbleText.vue')['default']
ElInput: typeof import('element-plus/es')['ElInput']
ElInputNumber: typeof import('element-plus/es')['ElInputNumber']
ElOption: typeof import('element-plus/es')['ElOption']
ElProgress: typeof import('element-plus/es')['ElProgress']
ElRadio: typeof import('element-plus/es')['ElRadio']
ElRadioButton: typeof import('element-plus/es')['ElRadioButton']
ElRadioGroup: typeof import('element-plus/es')['ElRadioGroup']
ElSelect: typeof import('element-plus/es')['ElSelect']
ElSlider: typeof import('element-plus/es')['ElSlider']
ElSwitch: typeof import('element-plus/es')['ElSwitch']
FeedbackModal: typeof import('./src/components/Toolbar/FeedbackModal.vue')['default']
Fireworks: typeof import('./src/components/Fireworks.vue')['default']
Footer: typeof import('./src/components/Practice/Footer.vue')['default']

View File

@@ -18,6 +18,7 @@
"@opentranslate/translator": "^1.4.2",
"@types/uuid": "^9.0.4",
"axios": "^1.5.0",
"compromise": "^14.10.0",
"copy-to-clipboard": "^3.3.3",
"element-plus": "^2.3.9",
"hover.css": "^2.3.2",
@@ -25,8 +26,6 @@
"lodash-es": "^4.17.21",
"mitt": "^3.0.1",
"pinia": "^2.1.6",
"sbd": "^1.0.19",
"sentence-splitter": "^4.2.1",
"swiper": "^10.1.0",
"tesseract.js": "^4.1.1",
"uuid": "^9.0.1",

1680
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ import {$ref} from "vue/macros";
import {cloneDeep} from "lodash-es";
import {DefaultArticle} from "@/types.ts";
import {onMounted, watch} from "vue";
import {splitEnArticle2} from "@/hooks/article.ts";
import {splitCNArticle, splitEnArticle} from "@/hooks/article.ts";
const base = useBaseStore()
@@ -17,16 +17,15 @@ let articleData = $ref({
stringIndex: 0,
})
onMounted(()=>{
setTimeout(()=>{
splitEnArticle2(base.currentEditDict.articles[2].text)
},100)
onMounted(() => {
// splitEnArticle('')
splitCNArticle('')
})
</script>
<template>
<div id="BatchAddArticle">
<!-- <AddArticle2/>-->
<AddArticle2/>
</div>
</template>

View File

@@ -1,7 +1,7 @@
import {DefaultArticleWord, Sentence, Word} from "@/types.ts";
import {cloneDeep, indexOf} from "lodash-es";
import {split} from 'sentence-splitter'
import tokenizer from 'sbd'
import {ArticleWord, DefaultArticleWord, Sentence} from "@/types.ts";
import {cloneDeep} from "lodash-es";
import nlp from "compromise";
import {split} from "sentence-splitter";
interface KeyboardMap {
Period: string,
@@ -25,329 +25,17 @@ export const EnKeyboardMap: KeyboardMap = {
Comma: ',',
Slash: '?',
Exclamation: '!',
QuoteLeft: '"',
QuoteRight: '"',
QuoteLeft: `'`,
QuoteRight: `'`,
}
/**
 * Splits English article text into paragraphs ("sections"), sentences and words
 * by scanning the text one character at a time.
 *
 * - Every '\n' closes the current section and starts a new one.
 * - The period, comma, question-mark and exclamation characters taken from
 *   `keyboardMap` are stored as symbol words, and each of them closes the
 *   current sentence.
 * - `keyboardMap.QuoteLeft` is stored as an opening or a closing quote symbol,
 *   depending on the nearest quote symbol that has already been recorded.
 *
 * @param text Raw article text.
 * @param lang Not read by this function; kept so existing callers keep working.
 * @param keyboardMap Punctuation characters to recognise.
 * @returns `newText` (the normalized text that was actually scanned) and
 *   `sections` (one `Sentence[]` per non-empty paragraph, each sentence's `text`
 *   rebuilt from its words).
 */
export function splitEnArticle(text: string, lang: string = 'en', keyboardMap: KeyboardMap = EnKeyboardMap): {
  sections: Sentence[][],
  newText: string
} {
  let sections: Sentence[][] = []
  let section: Sentence[] = []
  let sentence: Sentence = {
    text: '',
    translate: '',
    words: []
  }
  section.push(sentence)
  sections.push(section)
  let word = cloneDeep({...DefaultArticleWord, name: '', nextSpace: true});

  // Strip leading and trailing whitespace
  text = text.trim()
  // Append '\n' so the last paragraph is flushed by the '\n' case below
  text += '\n'

  // NOTE(review): the four search patterns below are empty as seen here. An empty
  // pattern makes replaceAll insert '"' between every character. Presumably they
  // originally held typographic quote characters that were lost - confirm and restore.
  text = text.replaceAll(``, '"')
  text = text.replaceAll(``, '"')
  text = text.replaceAll(``, '"')
  text = text.replaceAll(``, '"')
  // Replace every single quote with a double quote
  text = text.replaceAll(`'`, '"')
  // Restore the apostrophe of contractions that the previous step turned into '"'
  text = text.replaceAll(`"t`, `'t`)
  text = text.replaceAll(`"s`, `'s`)
  text = text.replaceAll(`"S`, `'S`)
  text = text.replaceAll(`"m`, `'m`)
  text = text.replaceAll(`"d`, `'d`)
  text = text.replaceAll(`"ve`, `'ve`)
  text = text.replaceAll(`"clock`, `'clock`)

  console.log('splitEnArticle', text)

  // Walks every recorded word from the newest to the oldest and returns the
  // symbolPosition of the first one that has a non-empty value, or null.
  const findNearestSymbolPosition = () => {
    for (let a = sections.length - 1; a >= 0; a--) {
      const sentenceList = sections[a]
      for (let b = sentenceList.length - 1; b >= 0; b--) {
        const wordList = sentenceList[b].words
        for (let c = wordList.length - 1; c >= 0; c--) {
          if (wordList[c].symbolPosition !== '') {
            return wordList[c].symbolPosition
          }
        }
      }
    }
    return null
  }

  text.split('').forEach((v, i, arr) => {
    switch (v) {
      case ' ':
        // A space ends the pending word, if there is one
        if (word.name) {
          sentence.words.push(word)
          word = cloneDeep(DefaultArticleWord)
        }
        break
      case keyboardMap.Period:
      case keyboardMap.Comma:
      case keyboardMap.Slash:
      case keyboardMap.Exclamation:
        // The pending word is followed directly by the symbol, then a new sentence starts
        word.nextSpace = false
        sentence.words.push(word)
        sentence.words.push(cloneDeep({...DefaultArticleWord, name: v, nextSpace: true, isSymbol: true}))
        section.push({
          text: '',
          translate: '',
          words: []
        })
        sentence = section[section.length - 1]
        word = cloneDeep(DefaultArticleWord)
        break
      case keyboardMap.QuoteLeft: {
        const nearSymbolPosition = findNearestSymbolPosition()
        if (nearSymbolPosition === 'end' || nearSymbolPosition === null) {
          // No quote is open: this one opens a quotation
          sentence.words.push(cloneDeep({
            ...DefaultArticleWord,
            name: v,
            nextSpace: false,
            isSymbol: true,
            symbolPosition: 'start'
          }))
          word = cloneDeep(DefaultArticleWord)
        } else {
          const openedInCurrentSentence = sentence.words.some(wordItem => wordItem.symbolPosition === 'start')
          if (openedInCurrentSentence) {
            // A quotation without terminating punctuation never reaches a terminator
            // case, so the pending word has to be flushed here.
            if (word.name.length) {
              sentence.words.push(word)
            }
            sentence.words.push(cloneDeep({
              ...DefaultArticleWord,
              name: v,
              nextSpace: true,
              isSymbol: true,
              symbolPosition: 'end'
            }))
            word = cloneDeep(DefaultArticleWord)
          } else {
            // The quotation was opened in an earlier sentence: attach the closing quote
            // to the previous sentence of this paragraph. When there is no previous
            // sentence (the quote was opened in an earlier paragraph), fall back to the
            // current sentence instead of throwing.
            const lastSentence = section[section.length - 2]
            const lastWord = lastSentence ? lastSentence.words[lastSentence.words.length - 1] : undefined
            if (lastWord) {
              lastWord.nextSpace = false
            }
            const target = lastSentence ? lastSentence : sentence
            target.words.push(cloneDeep({
              ...DefaultArticleWord,
              name: v,
              nextSpace: true,
              isSymbol: true,
              symbolPosition: 'end'
            }))
          }
        }
        break
      }
      case '\n':
        // Drop the current sentence when it holds no words (e.g. a blank line)
        if (!sentence.words.length) {
          section.pop()
          sentence = section[section.length - 1]
        }
        // A pending word means the last sentence had no terminator
        // (normally a sentence ends with a period or a comma)
        if (word.name.length) {
          // `sentence` is undefined when the paragraph consisted of one unterminated
          // word; re-create it so the word is kept instead of throwing.
          if (!sentence) {
            sentence = {
              text: '',
              translate: '',
              words: []
            }
            section.push(sentence)
          }
          sentence.words.push(word)
        }
        // Unless this is the '\n' appended above, open the next paragraph
        if (i !== arr.length - 1) {
          sections.push([])
          section = sections[sections.length - 1]
          section.push({
            text: '',
            translate: '',
            words: []
          })
          sentence = section[section.length - 1]
          word = cloneDeep(DefaultArticleWord)
        }
        break
      default:
        word.name += v
        break
    }
  })

  // Remove paragraphs that ended up without sentences
  sections = sections.filter(sectionItem => sectionItem.length)
  // Rebuild each sentence's text from its words
  sections.forEach(sectionItem => {
    sectionItem.forEach(sentenceItem => {
      sentenceItem.text = sentenceItem.words.reduce((previousValue: string, currentValue) => {
        return previousValue + currentValue.name + (currentValue.nextSpace ? ' ' : '')
      }, '')
    })
  })
  return {
    newText: text,
    sections
  }
}
/**
 * Splits Chinese article text into paragraphs ("sections"), sentences and words
 * by scanning the text one character at a time.
 *
 * - Every '\n' closes the current section and starts a new one.
 * - The period, comma, question-mark and exclamation characters taken from
 *   `keyboardMap` are stored as words, and each of them closes the current sentence.
 * - `keyboardMap.QuoteLeft` is stored as an opening quote symbol;
 *   `keyboardMap.QuoteRight` is stored as a closing quote symbol when the nearest
 *   recorded quote symbol is an opening one (or none exists).
 *
 * @param article Raw article text.
 * @param lang Not read by this function; kept so existing callers keep working.
 * @param keyboardMap Punctuation characters to recognise.
 * @returns One `Sentence[]` per non-empty paragraph, each sentence's `text`
 *   rebuilt from its words.
 */
export function splitCNArticle(article: string, lang: string = 'cn', keyboardMap: KeyboardMap = CnKeyboardMap): Sentence[][] {
  let sections: Sentence[][] = []
  let section: Sentence[] = []
  let sentence: Sentence = {
    text: '',
    translate: '',
    words: []
  }
  section.push(sentence)
  sections.push(section)
  let word = cloneDeep({...DefaultArticleWord, name: '', nextSpace: true});

  // Strip leading and trailing whitespace
  article = article.trim()
  // Append '\n' so the last paragraph is flushed by the '\n' case below
  article += '\n'

  // Walks every recorded word from the newest to the oldest and returns the
  // symbolPosition of the first one that has a non-empty value, or null.
  const findNearestSymbolPosition = () => {
    for (let a = sections.length - 1; a >= 0; a--) {
      const sentenceList = sections[a]
      for (let b = sentenceList.length - 1; b >= 0; b--) {
        const wordList = sentenceList[b].words
        for (let c = wordList.length - 1; c >= 0; c--) {
          if (wordList[c].symbolPosition !== '') {
            return wordList[c].symbolPosition
          }
        }
      }
    }
    return null
  }

  article.split('').forEach((v, i, arr) => {
    switch (v) {
      case ' ':
        // A space ends the pending word, if there is one
        if (word.name) {
          sentence.words.push(word)
          word = cloneDeep(DefaultArticleWord)
        }
        break
      case keyboardMap.Period:
      case keyboardMap.Comma:
      case keyboardMap.Slash:
      case keyboardMap.Exclamation:
        // The pending word is followed directly by the punctuation, then a new sentence starts
        word.nextSpace = false
        sentence.words.push(word)
        sentence.words.push(cloneDeep({...DefaultArticleWord, name: v, nextSpace: true}))
        section.push({
          text: '',
          translate: '',
          words: []
        })
        sentence = section[section.length - 1]
        word = cloneDeep(DefaultArticleWord)
        break
      case keyboardMap.QuoteLeft:
        sentence.words.push(cloneDeep({
          ...DefaultArticleWord,
          name: v,
          nextSpace: false,
          isSymbol: true,
          symbolPosition: 'start'
        }))
        word = cloneDeep(DefaultArticleWord)
        break
      case keyboardMap.QuoteRight: {
        const nearSymbolPosition = findNearestSymbolPosition()
        if (nearSymbolPosition === 'start' || nearSymbolPosition === null) {
          const openedInCurrentSentence = sentence.words.some(wordItem => wordItem.symbolPosition === 'start')
          if (openedInCurrentSentence) {
            // A quotation without terminating punctuation never reaches a terminator
            // case, so the pending word has to be flushed here.
            if (word.name.length) {
              sentence.words.push(word)
            }
            sentence.words.push(cloneDeep({
              ...DefaultArticleWord,
              name: v,
              nextSpace: true,
              isSymbol: true,
              symbolPosition: 'end'
            }))
            word = cloneDeep(DefaultArticleWord)
          } else {
            // The quotation was not opened in this sentence: attach the closing quote
            // to the previous sentence of this paragraph. When there is no previous
            // sentence, fall back to the current sentence instead of throwing.
            const lastSentence = section[section.length - 2]
            const lastWord = lastSentence ? lastSentence.words[lastSentence.words.length - 1] : undefined
            if (lastWord) {
              lastWord.nextSpace = false
            }
            const target = lastSentence ? lastSentence : sentence
            target.words.push(cloneDeep({
              ...DefaultArticleWord,
              name: v,
              nextSpace: true,
              isSymbol: true,
              symbolPosition: 'end'
            }))
          }
        }
        break
      }
      case '\n':
        // Drop the current sentence when it holds no words (e.g. a blank line)
        if (!sentence.words.length) {
          section.pop()
          sentence = section[section.length - 1]
        }
        // A pending word means the last sentence had no terminator
        // (normally a sentence ends with a period or a comma)
        if (word.name.length) {
          // `sentence` is undefined when the paragraph consisted of one unterminated
          // word; re-create it so the word is kept instead of throwing.
          if (!sentence) {
            sentence = {
              text: '',
              translate: '',
              words: []
            }
            section.push(sentence)
          }
          sentence.words.push(word)
        }
        // Unless this is the '\n' appended above, open the next paragraph
        if (i !== arr.length - 1) {
          sections.push([])
          section = sections[sections.length - 1]
          section.push({
            text: '',
            translate: '',
            words: []
          })
          sentence = section[section.length - 1]
          word = cloneDeep(DefaultArticleWord)
        }
        break
      default:
        word.name += v
        break
    }
  })

  // Remove paragraphs that ended up without sentences
  sections = sections.filter(sectionItem => sectionItem.length)
  // Rebuild each sentence's text from its words
  sections.forEach(sectionItem => {
    sectionItem.forEach(sentenceItem => {
      sentenceItem.text = sentenceItem.words.reduce((previousValue: string, currentValue) => {
        return previousValue + currentValue.name + (currentValue.nextSpace ? ' ' : '')
      }, '')
    })
  })
  return sections
}
/**
 * Renders the result of `splitCNArticle(article)` as plain text: one sentence
 * per line, with an empty line after every paragraph.
 *
 * @param article Text to split.
 * @returns The re-flowed text, or an empty string when nothing was split.
 */
export function getSplitTranslateText(article: string) {
  let output = ''
  for (const paragraph of splitCNArticle(article)) {
    for (const sentenceItem of paragraph) {
      output += sentenceItem.text + '\n'
    }
    // Blank line between paragraphs
    output += '\n'
  }
  return output
}
export function splitEnArticle2(text: string) {
export function splitEnArticle(text: string): { sections: Sentence[][], newText: string } {
console.time()
let keyboardMap = EnKeyboardMap
text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration. It was Sunday? I never get up early on Sundays! I sometimes stay in bed until lunchtime. Last Sunday I got up very late. I looked out of the window. It was dark outside. 'What a day!' I thought. 'It's raining again.' Just then, the telephone rang. It was my aunt Lucy. 'I've just arrived by train,' she said. 'I'm coming to see you.'\n 'But I'm still having breakfast,' I said\n 'What are you doing?' she asked.\n 'I'm having breakfast,' I repeated.\n 'Dear me,' she said. 'Do you always get up so late? It's one o'clock!'"
// text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration. It was Sunday? I never get up early on Sundays! I sometimes stay in bed until lunchtime. Last Sunday I got up very late. I looked out of the window. It was dark outside. 'What a day!' I thought. 'It's raining again.' Just then, the telephone rang. It was my aunt Lucy. 'I've just arrived by train,' she said. 'I'm coming to see you.'\n 'But I'm still having breakfast,' I said\n 'What are you doing?' she asked.\n 'I'm having breakfast,' I repeated.\n 'Dear me,' she said. 'Do you always get up so late? It's one o'clock'"
// text = "Mr.James Scott has a garage in Silbury and now he has just bought another garage in Pinhurst. Pinhurst is only five miles from Silbury, but Mr. Scott cannot get a telephone for his new garage, so he has just bought twelve pigeons. Yesterday, a pigeon carried the first message from Pinhurst to Silbury. The bird covered the distance in three minutes. Up to now, Mr.Scott has sent a great many requests for spare parts and other urgent messages from one garage to the other. In this way, he has begun his own private 'telephone' service."
// text = "How does the older investor differ in his approach to investment from the younger investor?\nThere is no shortage of tipsters around offering 'get-rich-quick' opportunities. But if you are a serious private investor, leave the Las Vegas mentality to those with money to fritter. The serious investor needs a proper 'portfolio' -- a well-planned selection of investments, with a definite structure and a clear aim. But exactly how does a newcomer to the stock market go about achieving that?\nWell, if you go to five reputable stock brokers and ask them what you should do with your money, you're likely to get five different answers, -- even if you give all the relevant information about your age age, family, finances and what you want from your investments. Moral? There is no one 'right' way to structure a portfolio. However, there are undoubtedly some wrong ways, and you can be sure that none of our five advisers would have suggested sinking all (or perhaps any) of your money into Periwigs*.\nSo what should you do? We'll assume that you have sorted out the basics -- like mortgages, pensions, insurance and access to sufficient cash reserves. You should then establish your own individual aims. These are partly a matter of personal circumstances, partly a matter of psychology.\nFor instance, if you are older you have less time to recover from any major losses, and you may well wish to boost your pension income. So preserving your capital and generating extra income are your main priorities. In this case, you'd probably construct a portfolio with some shares (but not high risk ones), along with gilts, cash deposits, and perhaps convertibles or the income shares of split capital investment trusts.\nIf you are younger, and in a solid financial position, you may decide to take an aggressive approach -- but only if you're blessed with a sanguine disposition and won't suffer sleepless nights over share prices. If portfolio, alongside your more pedestrian in vestments. 
Once you have decided on your investment aims, you can then decide where to put your money. The golden rule here is spread your risk -- if you put all of your money into Periwigs International, you're setting yourself up as a hostage to fortune.\n*'Periwigs' is the name of a fictitious company.\nINVESTOR'S CHRONICLE, March 23 1990"
//去除头和尾部的空格
text = text.trim()
// text = text.replaceAll(``, '"')
@@ -367,140 +55,151 @@ export function splitEnArticle2(text: string) {
// text = text.replaceAll(`"clock`, `'clock`)
// console.log('splitEnArticle', text)
let optional_options = {newline_boundaries: true};
let sentences = tokenizer.sentences(text, optional_options);
console.log(sentences);
let sections: Sentence[][] = []
text.split('\n').map((rowSection, i) => {
text && text.split('\n').map((rowSection, i) => {
let section: Sentence[] = []
sections.push(section)
rowSection = rowSection.trim()
// console.log(split(rowSection,{
// SeparatorParser:{
// separatorCharacters:['.']
// }
// }))
return
// let section: Sentence[] = []
// sections.push(section)
//
// // console.log('rowSection', rowSection)
// rowSection.split('.').map((rowSentence, j) => {
//
// rowSentence = rowSentence.trim()
// if (rowSentence) {
// //如果以.结尾,那么最后一项为空,忽略
// // if (rowSentence && rowSentence[rowSentence.length - 1] !== "'") {
// // rowSentence += '.'
// // }
// if (rowSentence === '"') {
// let lastSentence = section[section.length - 1]
// lastSentence.text += '"'
// } else {
// console.log('rowSentence', rowSentence)
// rowSentence += '.'
// let sentence: Sentence = {
// text: rowSentence,
// translate: '',
// words: []
// }
// section.push(sentence)
// }
//
// // rowSentence.split('').map((v, i, arr) => {
// //
// // })
// // let word = cloneDeep({...DefaultArticleWord, name: '', nextSpace: true});
// // rowSentence.split('').map((v, i, arr) => {
// // switch (v) {
// // case ' ':
// // if (word.name) {
// // sentence.words.push(word)
// // word = cloneDeep(DefaultArticleWord)
// // }
// // break
// // case keyboardMap.Period:
// // word.nextSpace = false
// // sentence.words.push(word)
// // sentence.words.push(cloneDeep({...DefaultArticleWord, name: v, nextSpace: true, isSymbol: true}))
// // word = cloneDeep(DefaultArticleWord)
// // break
// // case keyboardMap.Comma:
// // case keyboardMap.Slash:
// // case keyboardMap.Exclamation:
// // word.nextSpace = false
// // sentence.words.push(word)
// // sentence.words.push(cloneDeep({...DefaultArticleWord, name: v, nextSpace: true, isSymbol: true}))
// // section.push({
// // text: '',
// // translate: '',
// // words: []
// // })
// // sentence = section[section.length - 1]
// // word = cloneDeep(DefaultArticleWord)
// // break
// // case keyboardMap.QuoteLeft:
// // let nearSymbolPosition = null
// // //TODO 可以优化成for+break
// // section.toReversed().map((sentenceItem, b) => {
// // sentenceItem.words.toReversed().map((wordItem, c) => {
// // if (wordItem.symbolPosition !== '' && nearSymbolPosition === null) {
// // nearSymbolPosition = wordItem.symbolPosition
// // }
// // })
// // })
// //
// // if (nearSymbolPosition === 'end' || nearSymbolPosition === null) {
// // sentence.words.push(cloneDeep({
// // ...DefaultArticleWord,
// // name: v,
// // nextSpace: false,
// // isSymbol: true,
// // symbolPosition: 'start'
// // }))
// // word = cloneDeep(DefaultArticleWord)
// // } else {
// // let addCurrent = false
// // sentence.words.toReversed().map((wordItem, c) => {
// // if (wordItem.symbolPosition === 'start' && !addCurrent) {
// // addCurrent = true
// // }
// // })
// // if (addCurrent) {
// // //`“这是私人谈话”`这种没有结束符号的情况swtich走不到结束符号也就不会起新的一行
// // if (word.name.length) {
// // sentence.words.push(word)
// // }
// // sentence.words.push(cloneDeep({
// // ...DefaultArticleWord,
// // name: v,
// // nextSpace: true,
// // isSymbol: true,
// // symbolPosition: 'end'
// // }))
// // word = cloneDeep(DefaultArticleWord)
// // } else {
// // debugger
// // let lastSentence = section[section.length - 2]
// // lastSentence.words[lastSentence.words.length - 1].nextSpace = false
// // lastSentence.words.push(cloneDeep({
// // ...DefaultArticleWord,
// // name: v,
// // nextSpace: true,
// // isSymbol: true,
// // symbolPosition: 'end'
// // }))
// // }
// // }
// // break
// // default:
// // // if (v === '2')debugger
// // word.name += v
// // break
// // }
// // })
//
// }
// })
let doc = nlp.tokenize(rowSection)
let sentences = doc.json()
// console.log('--')
sentences.map(sentenceRow => {
let sentence: Sentence = {
text: sentenceRow.text,
// text: '',
translate: '',
words: []
}
section.push(sentence)
/**
 * Records `pre` as a quote-like symbol word.
 *
 * The symbol opens a quotation when the nearest quote symbol already recorded in
 * this paragraph is a closing one (or none exists); otherwise it closes the open
 * quotation. When the quotation was opened in an earlier sentence, the current
 * sentence is merged into the previous one so the quotation stays in one sentence.
 *
 * NOTE(review): the merge only joins `words`; the previous sentence's `text` is
 * left as it was - confirm whether it should be extended too.
 */
const checkQuote = (pre: string) => {
  // Find the most recently recorded quote symbol in this paragraph
  let nearSymbolPosition: string | null = null
  search: for (let b = section.length - 1; b >= 0; b--) {
    const wordList = section[b].words
    for (let c = wordList.length - 1; c >= 0; c--) {
      if (wordList[c].symbolPosition !== '') {
        nearSymbolPosition = wordList[c].symbolPosition
        break search
      }
    }
  }

  let word3: ArticleWord = {
    ...DefaultArticleWord,
    name: pre,
    nextSpace: false,
    isSymbol: true,
    symbolPosition: ''
  };

  if (nearSymbolPosition === 'end' || nearSymbolPosition === null) {
    // No quotation is open: this symbol opens one
    word3.symbolPosition = 'start'
    sentence.words.push(word3)
    return
  }

  // Closing symbol: it sticks to the word before it. The sentence may still be
  // empty (a leading quote while an earlier one is open), so guard the access.
  const previousWord = sentence.words[sentence.words.length - 1]
  if (previousWord) {
    previousWord.nextSpace = false
  }
  word3.symbolPosition = 'end'
  word3.nextSpace = true

  const openedInCurrentSentence = sentence.words.some(wordItem => wordItem.symbolPosition === 'start')
  const lastSentence = section[section.length - 2]
  if (openedInCurrentSentence || !lastSentence) {
    sentence.words.push(word3)
  } else {
    // e.g. 'Do you always get up so late? It's one o'clock!' is split into two
    // sentences by the tokenizer; join them again.
    lastSentence.words = lastSentence.words.concat(sentence.words)
    lastSentence.words.push(word3)
    section.pop()
    // Keep working on the merged sentence so that words added afterwards are not
    // pushed onto the sentence that was just removed from the paragraph.
    sentence = lastSentence
  }
}
/**
 * Records the punctuation that follows a term (`post`) as symbol word(s) on the
 * current sentence.
 *
 * @param post Text found after a term: one symbol, or several characters.
 * @param nextSpace Value stored as `nextSpace` on a period / comma / question /
 *   exclamation symbol; the recursive per-character calls below pass `false`.
 */
const checkSymbol = (post: string, nextSpace: boolean = true) => {
switch (post) {
// Sentence punctuation: attach it directly to the preceding word
case keyboardMap.Period:
case keyboardMap.Comma:
case keyboardMap.Slash:
case keyboardMap.Exclamation:
sentence.words[sentence.words.length - 1].nextSpace = false
let word2 = cloneDeep({
...DefaultArticleWord,
name: post,
isSymbol: true,
nextSpace
});
sentence.words.push(word2)
break
// Quote character and ')' are delegated to checkQuote
case keyboardMap.QuoteLeft:
case ')':
checkQuote(post)
break
// Punctuation immediately followed by a quote: handle each character on its own
case `.'`:
case `!'`:
case `?'`:
case `,'`:
case `*'`:
post.split('').map(v => {
checkSymbol(v, false)
})
break
// For a post such as "' -- " the space has to be preserved. A placeholder word is
// used for that, because every symbol sets the previous word's nextSpace to false.
case ' ':
sentence.words[sentence.words.length - 1].nextSpace = true
let word3 = cloneDeep({
...DefaultArticleWord,
name: 'placeholder',
isSymbol: true,
nextSpace: false,
});
sentence.words.push(word3)
break
default:
// console.log('post', post)
// Mostly unusual connector characters end up here
if (post.length > 1) {
// Several characters: process them one by one
post.split('').map(v => {
checkSymbol(v, false)
})
} else {
// Any other single character: store it as a symbol with no space on either side
sentence.words[sentence.words.length - 1].nextSpace = false
let word3 = cloneDeep({
...DefaultArticleWord,
name: post,
isSymbol: true,
nextSpace: false,
});
sentence.words.push(word3)
}
break
}
}
sentenceRow.terms.map(v => {
// console.log('v', v)
if (v.text) {
let pre: string = v.pre.trim()
if (pre) {
checkQuote(pre)
}
let word = cloneDeep({...DefaultArticleWord, name: v.text, nextSpace: true});
sentence.words.push(word)
let post: string = v.post
//判断是不是等于空,因为正常的词后面都会有个空格。这种不需要处理。
if (post && post !== ' ') {
checkSymbol(post)
}
}
})
//去除空格占位符
sentence.words = sentence.words.filter(v => v.name !== 'placeholder')
})
})
// sections = sections.filter(sectionItem => sectionItem.length)
@@ -513,13 +212,63 @@ export function splitEnArticle2(text: string) {
// })
// })
// console.log(sections)
// console.log('--')
//
// console.log(split(`'What a day!' I thought.`,{
// SeparatorParser:{
// separatorCharacters:['!']
// }
// }))
// console.log('--')
// console.log(split(`On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.`,))
// console.timeEnd()
return {
newText: text,
sections
}
}
/**
 * Splits Chinese text into paragraphs and sentences.
 *
 * The text is split on '\n' into paragraphs; each trimmed paragraph is passed to
 * `split` from sentence-splitter and every returned node becomes one `Sentence`
 * whose `text` is the node's `raw` value (`words` is left empty).
 *
 * @param text Raw article text.
 * @returns One `Sentence[]` per paragraph; an empty array for blank input.
 */
export function splitCNArticle(text: string): Sentence[][] {
  // Sample inputs kept for manual debugging:
  // text = "飞机误点了,侦探们在机场等了整整一上午。他们正期待从南非来的一个装着钻石的贵重包裹。数小时以前,有人向警方报告,说有人企图偷走这些钻石。当飞机到达时,一些侦探等候在主楼内,另一些侦探则守候在停机坪上。有两个人把包裹拿下飞机,进了海关。这时两个侦探把住门口,另外两个侦探打开了包裹。令他们吃惊的是,那珍贵的包裹里面装的全是石头和沙子!"
  // text = "那是 4.4 个星期天?而在星期天我是从来不早起的,有时我要一直躺到吃午饭的时候。上个星期天,我起得很晚。我望望窗外,外面一片昏暗。“鬼天气!”我想,“又下雨了。”正在这时,电话铃响了。是我姑母露西打来的。“我刚下火车,”她说,“我这就来看你。”\n “但我还在吃早饭,”我说。\n “你在干什么?”她问道。\n “我正在吃早饭,”我又说了一遍。\n “天啊”她说“你总是起得这么晚吗现在已经1点钟了”"

  // Strip leading and trailing whitespace
  text = text.trim()
  const sections: Sentence[][] = []
  if (text) {
    for (const rawParagraph of text.split('\n')) {
      const section: Sentence[] = []
      sections.push(section)
      for (const sentenceRow of split(rawParagraph.trim())) {
        const row = sentenceRow.raw
        const sentence: Sentence = {
          text: row,
          translate: '',
          words: []
        }
        section.push(sentence)
        // The library puts a closing quote at the start of the following sentence;
        // move it back to the end of the previous one.
        if (row && row[0] === "”") {
          const lastSentence = section[section.length - 2]
          // Without a previous sentence there is nowhere to move the quote to,
          // so the text is left untouched instead of throwing.
          if (lastSentence) {
            sentence.text = row.slice(1)
            lastSentence.text += "”"
            // Nothing but the quote was left: drop the now-empty sentence
            if (!sentence.text) {
              section.pop()
            }
          }
        }
      }
    }
  }
  console.log('sections', sections)
  return sections
}
/**
 * Formats the output of `splitCNArticle(article)` as text with one sentence per
 * line and an empty line closing every paragraph.
 *
 * @param article Text to split.
 * @returns The formatted text; an empty string when there are no paragraphs.
 */
export function getSplitTranslateText(article: string) {
  return splitCNArticle(article)
    .map((paragraph) => {
      // Each sentence on its own line, then a blank line after the paragraph
      const lines = paragraph.map((sentenceItem) => sentenceItem.text + '\n')
      return lines.join('') + '\n'
    })
    .join('')
}

View File

@@ -1,54 +0,0 @@
let text = `On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration. It was Sunday? I never get up early on Sundays! I sometimes stay in bed until lunchtime. Last Sunday I got up very late. I looked out of the window. It was dark outside. 'What a day!' I thought. 'It's raining again.' Just then, the telephone rang. It was my aunt Lucy. 'I've just arrived by train,' she said. 'I'm coming to see you.'\n 'But I'm still having breakfast,' I said\n 'What are you doing?' she asked.\n 'I'm having breakfast,' I repeated.\n 'Dear me,' she said. 'Do you always get up so late? It's one o'clock!'`;
// text = `How does the older investor differ in his approach to investment from the younger investor?\nThere is no shortage of tipsters around offering 'get-rich-quick' opportunities. But if you are a serious private investor, leave the Las Vegas mentality to those with money to fritter. The serious investor needs a proper 'portfolio' -- a well-planned selection of investments, with a definite structure and a clear aim. But exactly how does a newcomer to the stock market go about achieving that?\nWell, if you go to five reputable stock brokers and ask them what you should do with your money, you're likely to get five different answers, -- even if you give all the relevant information about your age age, family, finances and what you want from your investments. Moral? There is no one 'right' way to structure a portfolio. However, there are undoubtedly some wrong ways, and you can be sure that none of our five advisers would have suggested sinking all (or perhaps any) of your money into Periwigs*.\nSo what should you do? We'll assume that you have sorted out the basics -- like mortgages, pensions, insurance and access to sufficient cash reserves. You should then establish your own individual aims. These are partly a matter of personal circumstances, partly a matter of psychology.\nFor instance, if you are older you have less time to recover from any major losses, and you may well wish to boost your pension income. So preserving your capital and generating extra income are your main priorities. In this case, you'd probably construct a portfolio with some shares (but not high risk ones), along with gilts, cash deposits, and perhaps convertibles or the income shares of split capital investment trusts.\nIf you are younger, and in a solid financial position, you may decide to take an aggressive approach -- but only if you're blessed with a sanguine disposition and won't suffer sleepless nights over share prices. If portfolio, alongside your more pedestrian in vestments. 
Once you have decided on your investment aims, you can then decide where to put your money. The golden rule here is spread your risk -- if you put all of your money into Periwigs International, you're setting yourself up as a hostage to fortune.\n*'Periwigs' is the name of a fictitious company.\nINVESTOR'S CHRONICLE, March 23 1990`;
// Scratch script: runs the sample `text` defined above through two sentence
// splitters (sbd, then wink-nlp), prints what each returns and times the run.
console.time()
// Turn every single quote into a double quote
text = text.replaceAll(`'`, '"')
// Restore the apostrophe of contractions that the previous step turned into '"'
text = text.replaceAll(`"t`, `'t`)
text = text.replaceAll(`"s`, `'s`)
text = text.replaceAll(`"S`, `'S`)
text = text.replaceAll(`"m`, `'m`)
text = text.replaceAll(`"d`, `'d`)
text = text.replaceAll(`"ve`, `'ve`)
text = text.replaceAll(`"re`, `'re`)
text = text.replaceAll(`"clock`, `'clock`)
// Earlier experiment with sentence-tokenizer, left disabled
// var Tokenizer = require('sentence-tokenizer');
// var tokenizer = new Tokenizer('Chuck');
// tokenizer.setEntry(v);
// console.log(tokenizer.getSentences());
// --- sbd ---
var tokenizer = require('sbd');
var optional_options = {
newline_boundaries: true
};
// // // text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.";
// // text.split('\n').map(v=>{
// // })
var sentences = tokenizer.sentences(text, optional_options);
console.log(sentences);
// --- wink-nlp ---
// Load wink-nlp package.
const winkNLP = require('wink-nlp');
// Load english language model.
const model = require('wink-eng-lite-web-model');
// Instantiate winkNLP.
const nlp = winkNLP(model);
// Obtain "its" helper to extract item properties.
const its = nlp.its;
// Obtain "as" reducer helper to reduce a collection.
const as = nlp.as;
// NLP Code.
// text = 'Hello World🌎! How are you?';
const doc = nlp.readDoc(text);
// console.log( doc.out() );
// -> Hello World🌎! How are you?
// Print every sentence wink-nlp detected
doc.sentences().each(v => {
console.log(v.out());
// console.log(v.tokens().out());
})
console.timeEnd()

199
test/package-lock.json generated
View File

@@ -1,199 +0,0 @@
{
"name": "test",
"version": "1.0.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "test",
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"sbd": "^1.0.19",
"sentence-tokenizer": "^1.0.1",
"wink-eng-lite-web-model": "^1.5.2",
"wink-nlp": "^1.14.3"
}
},
"node_modules/debug": {
"version": "4.1.0",
"resolved": "https://registry.npmmirror.com/debug/-/debug-4.1.0.tgz",
"integrity": "sha512-heNPJUJIqC+xB6ayLAMHaIrmN9HKa7aQO8MGqKpvCA+uJYVcvR6l5kgdrhRuwPFHU7P5/A1w0BjByPHwpfTDKg==",
"deprecated": "Debug versions >=3.2.0 <3.2.7 || >=4 <4.3.1 have a low-severity ReDos regression when used in a Node.js environment. It is recommended you upgrade to 3.2.7 or 4.3.1. (https://github.com/visionmedia/debug/issues/797)",
"dependencies": {
"ms": "^2.1.1"
}
},
"node_modules/deepmerge": {
"version": "4.3.1",
"resolved": "https://registry.npmmirror.com/deepmerge/-/deepmerge-4.3.1.tgz",
"integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmmirror.com/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmmirror.com/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
}
},
"node_modules/domutils": {
"version": "3.1.0",
"resolved": "https://registry.npmmirror.com/domutils/-/domutils-3.1.0.tgz",
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
}
},
"node_modules/entities": {
"version": "4.5.0",
"resolved": "https://registry.npmmirror.com/entities/-/entities-4.5.0.tgz",
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"engines": {
"node": ">=0.12"
}
},
"node_modules/escape-string-regexp": {
"version": "4.0.0",
"resolved": "https://registry.npmmirror.com/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz",
"integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
"engines": {
"node": ">=10"
}
},
"node_modules/htmlparser2": {
"version": "8.0.2",
"resolved": "https://registry.npmmirror.com/htmlparser2/-/htmlparser2-8.0.2.tgz",
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"entities": "^4.4.0"
}
},
"node_modules/is-plain-object": {
"version": "5.0.0",
"resolved": "https://registry.npmmirror.com/is-plain-object/-/is-plain-object-5.0.0.tgz",
"integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
},
"node_modules/nanoid": {
"version": "3.3.6",
"resolved": "https://registry.npmmirror.com/nanoid/-/nanoid-3.3.6.tgz",
"integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==",
"bin": {
"nanoid": "bin/nanoid.cjs"
},
"engines": {
"node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
}
},
"node_modules/parse-srcset": {
"version": "1.0.2",
"resolved": "https://registry.npmmirror.com/parse-srcset/-/parse-srcset-1.0.2.tgz",
"integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q=="
},
"node_modules/picocolors": {
"version": "1.0.0",
"resolved": "https://registry.npmmirror.com/picocolors/-/picocolors-1.0.0.tgz",
"integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ=="
},
"node_modules/postcss": {
"version": "8.4.30",
"resolved": "https://registry.npmmirror.com/postcss/-/postcss-8.4.30.tgz",
"integrity": "sha512-7ZEao1g4kd68l97aWG/etQKPKq07us0ieSZ2TnFDk11i0ZfDW2AwKHYU8qv4MZKqN2fdBfg+7q0ES06UA73C1g==",
"dependencies": {
"nanoid": "^3.3.6",
"picocolors": "^1.0.0",
"source-map-js": "^1.0.2"
},
"engines": {
"node": "^10 || ^12 || >=14"
}
},
"node_modules/sanitize-html": {
"version": "2.11.0",
"resolved": "https://registry.npmmirror.com/sanitize-html/-/sanitize-html-2.11.0.tgz",
"integrity": "sha512-BG68EDHRaGKqlsNjJ2xUB7gpInPA8gVx/mvjO743hZaeMCZ2DwzW7xvsqZ+KNU4QKwj86HJ3uu2liISf2qBBUA==",
"dependencies": {
"deepmerge": "^4.2.2",
"escape-string-regexp": "^4.0.0",
"htmlparser2": "^8.0.0",
"is-plain-object": "^5.0.0",
"parse-srcset": "^1.0.2",
"postcss": "^8.3.11"
}
},
"node_modules/sbd": {
"version": "1.0.19",
"resolved": "https://registry.npmmirror.com/sbd/-/sbd-1.0.19.tgz",
"integrity": "sha512-b5RyZMGSrFuIB4AHdbv12uYHS8YGEJ36gtuvG3RflbJGY+T0dXmAL0E4vZjQqT2RsX0v+ZwVqhV2zsGr5aFK9w==",
"dependencies": {
"sanitize-html": "^2.3.2"
}
},
"node_modules/sentence-tokenizer": {
"version": "1.0.1",
"resolved": "https://registry.npmmirror.com/sentence-tokenizer/-/sentence-tokenizer-1.0.1.tgz",
"integrity": "sha512-nmKF6fXmgZouD3AfWgYCmr35g7g7ObtbTlEFRVx2oj/ptrCOrosixrhXhWUdnPRdze7xhMf4IcliAa021BMXTA==",
"dependencies": {
"debug": "4.1.0"
}
},
"node_modules/source-map-js": {
"version": "1.0.2",
"resolved": "https://registry.npmmirror.com/source-map-js/-/source-map-js-1.0.2.tgz",
"integrity": "sha512-R0XvVJ9WusLiqTCEiGCmICCMplcCkIwwR11mOSD9CR5u+IXYdiseeEuXCVAjS54zqwkLcPNnmU4OeJ6tUrWhDw==",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/wink-eng-lite-web-model": {
"version": "1.5.2",
"resolved": "https://registry.npmmirror.com/wink-eng-lite-web-model/-/wink-eng-lite-web-model-1.5.2.tgz",
"integrity": "sha512-qYxAtJi0DuF0mVNr3wwdNj5u+jG9xH8BH45MbNLGzwQrUUpgKs87GfBLup0+ZfuXcpunw6AsgL7xtdl5vtUd2w==",
"engines": {
"node": ">=16.0.0"
},
"peerDependencies": {
"wink-nlp": ">1.8.1"
}
},
"node_modules/wink-nlp": {
"version": "1.14.3",
"resolved": "https://registry.npmmirror.com/wink-nlp/-/wink-nlp-1.14.3.tgz",
"integrity": "sha512-lvY5iCs3T8I34F8WKS70+2P0U9dWLn3vdPf/Z+m2VK14N7OmqnPzmHfh3moHdusajoQ37Em39z0IZB9K4x/96A=="
}
}
}

View File

@@ -1,18 +0,0 @@
{
"name": "test",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"sbd": "^1.0.19",
"sentence-tokenizer": "^1.0.1",
"wink-eng-lite-web-model": "^1.5.2",
"wink-nlp": "^1.14.3"
}
}