JavaScript代码过滤掉字符串中的常见单词
JavaScript code to filter out common words in a string
我正在尝试编写一段JavaScript代码:读取一个字符串(例如一句英文),然后输出另一个以逗号分隔的字符串,其中只包含"不常见"的单词。比如:
// Example input: the sentence whose words should be filtered.
var sentence="The dog ran to the other side of the field.";
// Comma-separated list of the words to treat as common.
var common_words="the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of";
—一些JavaScript代码—
// Desired result: the words of the sentence that are not in the common-word list.
var uncommon_words="dog, ran, other, side, field";
我该怎么做?
给你:
/**
 * Returns the words of `sentence` that do not appear in `common`.
 *
 * @param {string} sentence - Text to scan; a word is a maximal run of `\w` characters.
 * @param {string} common - Comma-separated list of words to exclude. Entries are
 *     trimmed and compared case-insensitively. Multi-word entries such as "it is"
 *     can never match, because the sentence is compared one word at a time.
 * @returns {string[]} The lower-cased uncommon words in sentence order (duplicates kept).
 */
function getUncommon(sentence, common) {
    // match() returns null when the sentence contains no words; use an empty list instead.
    var wordArr = sentence.match(/\w+/g) || [],
        // No prototype, so words such as "constructor" or "toString" are not
        // mistaken for common words through inherited properties.
        commonObj = Object.create(null),
        commonArr = common.split(','),
        uncommonArr = [],
        word, i;

    for ( i = 0; i < commonArr.length; i++ ) {
        // Sentence words are lower-cased below, so the lookup keys must be too.
        commonObj[ commonArr[i].trim().toLowerCase() ] = true;
    }

    for ( i = 0; i < wordArr.length; i++ ) {
        word = wordArr[i].toLowerCase();
        if ( !commonObj[word] ) {
            uncommonArr.push(word);
        }
    }

    return uncommonArr;
}
现场演示: http://jsfiddle.net/simevidas/knXkS/
要删除的单词称为停用词(stop words),即:
["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"]
来源:http://99webtools.com/list-english-stop-words.php
// So your code should be:

/**
 * Returns the words of `sentence` that are not English stop words.
 *
 * @param {string} sentence - Text to scan.
 * @returns {string[]} The lower-cased non-stop words in sentence order (duplicates kept).
 */
function getNoneStopWords(sentence) {
    var common = getStopWords();
    // Keep contractions ("don't", "you're") as single tokens; otherwise the
    // contraction entries of the stop list could never match. match() returns
    // null when the sentence contains no words, hence the empty-list fallback.
    var wordArr = sentence.match(/\w+(?:'\w+)*/g) || [],
        // No prototype, so words such as "constructor" are not treated as
        // stop words through inherited properties.
        commonObj = Object.create(null),
        uncommonArr = [],
        word, i;

    for (i = 0; i < common.length; i++) {
        commonObj[ common[i].trim() ] = true;
    }

    for (i = 0; i < wordArr.length; i++) {
        word = wordArr[i].toLowerCase();
        if (!commonObj[word]) {
            uncommonArr.push(word);
        }
    }

    return uncommonArr;
}

/**
 * Returns the list of English stop words (all lower-case, contractions included).
 *
 * @returns {string[]} A new array on every call.
 */
function getStopWords() {
    return ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"];
}
这个怎么样?
// Removes every common word (whole words only, case-insensitive). Only part of
// the word list is shown here; add the remaining words to the alternation.
// replace() returns the new string and leaves `sentence` itself unchanged.
sentence.replace(/\b(?:the|it is|we all|an?|by|to|you|[mh]e|she|they|we)\b/ig, '');
这将从你的句子中删除所有常用词。只要按你想要的方式分割剩下的字符串
我想从这里开始:
// Split the sentence on runs of non-word characters. Leading or trailing
// punctuation produces empty strings, which are skipped in the loop below.
var sentence_arr = sentence.split(/\W+/);
// Lower-case the common words once so the comparison is case-insensitive.
var common_arr = common_words.toLowerCase().split(', ');
var uncommon_arr = [];
for (var i = 0; i < sentence_arr.length; i++) {
    var word = sentence_arr[i].toLowerCase();
    // Keep the word only if it matches none of the common words. indexOf
    // compares it against every list entry, so each word is pushed at most once.
    if (word !== '' && common_arr.indexOf(word) === -1) {
        uncommon_arr.push(word);
    }
}
var uncommon_words = uncommon_arr.join(', ');
完全没有经过测试,但关键是你将两个句子分开,并分别检查每个单词与列表中的每个成员的对比。有点幼稚,完全不能扩展,但是对于像这样的小例子来说是可以的。
首先构建一个常用词的关联数组,然后对句子进行分词,输出其中未被该数组包含的单词。例如
// Lookup table of the common words. It has no prototype, so inherited keys
// such as "constructor" are not mistaken for excluded words.
var excluded = Object.create(null);
// NOTE: this rebinds common_words from the comma-separated string to an array.
common_words = common_words.split(",");
// Index loop rather than for...in, which would also visit any enumerable
// properties added to Array.prototype.
for (var i = 0; i < common_words.length; i++) {
    excluded[common_words[i].trim().toLowerCase()] = true;
}
var result = [];
// match() returns null when the sentence contains no words at all.
var match = sentence.match(/\w+/g) || [];
for (i = 0; i < match.length; i++) {
    if (!excluded[match[i].toLowerCase()]) {
        result.push(match[i]);
    }
}
var uncommon_words = result.join(", ");
String#diff
函数返回一个差异(不常见术语)列表。术语可以以数组或字符串的形式提供。
调用方式:sentence.diff(terms)。
下面是一个单元测试:
// Smoke test for String#diff: logs 'pass' when the uncommon words of the
// sentence come back in order, 'fail' otherwise.
var sentence = 'The dog ran to the other side of the field.';
// The terms may be given as a comma-separated string (as here) or as an array.
var terms = 'the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of';
if (sentence.diff(terms).toString() === 'dog,ran,other,side,field') {
    console.log('pass');
} else {
    console.log('fail');
}
下面是 String#diff 函数的定义:
/**
 * Returns the words of this string that are not listed in `terms`.
 *
 * Words are the runs of `\w` characters in the string. They are compared
 * case-insensitively but returned in their original case and order.
 *
 * @param {string|string[]} terms - Words to exclude, either as an array or as
 *     a comma-separated string. Multi-word terms such as "it is" never match,
 *     because the string is compared one word at a time.
 * @returns {string[]} The words not found in `terms`; an empty array when
 *     `terms` is falsy or is neither a string nor an array.
 */
String.prototype.diff = function(terms){
    if (!terms) {
        return [];
    }

    var termList = (typeof terms === 'string') ? terms.split(/,\s*/) : terms;
    if (!Array.isArray(termList)) {
        return [];
    }

    // Null-prototype lookup table: constant-time membership checks without
    // inherited keys such as "constructor" counting as excluded terms.
    var excluded = Object.create(null);
    termList.forEach(function(term){
        excluded[String(term).trim().toLowerCase()] = true;
    });

    // Splitting on non-word runs leaves empty strings at the edges (e.g. after
    // a trailing "."); those are dropped along with the excluded terms.
    return String(this).split(/\W+/).filter(function(word){
        return word.length > 0 && !excluded[word.toLowerCase()];
    });
};
相关文章:
- 单击按钮时显示随机字符串
- jquery从2个json字符串构建一个复选框表单
- 从表单中动态生成一个字符串,传递给通过AJAX加载的PHP文件
- 将字符串中的单引号替换为双引号
- 修改表单提交字符串
- 表单提交字符串concat/modify
- 当用户单击按钮(在光标位置)时,在输入字段中添加一个文本字符串
- 使用jquery或regex,带点但不带点的拆分字符串位于单引号中
- 如何在字符串输入的 Html 元素中的双引号内编写单引号
- JavaScript 相等操作对单引号字符串和数字都成功
- 对字符串使用单引号或双引号
- 在 Javascript 中拆分包含单次出现(不是两次)分隔符的字符串
- 使用 javascript 中的表单元素构建一个 xml 字符串
- 如何在javasciipt中添加单引号到字符串,该字符串将在nodeJS中作为mysql插入命令执行
- 一次单击一下即可在句子中显示大字符串
- 为什么当我尝试使用Ajax和外部PHP文件(该文件接收两个字符串)发布内容时,我的表单不能正常工作
- 为什么只有当我在字符串周围输入引号时,表单输入的isNaN才为true
- 使用Javascript DOMParser将单字符串HTML格式化为多行选项卡HTML
- 无法从JSON字符串实例化类型的值;没有单字符串构造函数/工厂方法
- 使用Javascript为变量内部的值添加单字符串引号