文本挖掘实战——如何找出新词思路及代码实现

本帖最后由 fc013 于 2016-10-15 20:17 编辑

问题导读：

1.怎样自动地从文本中找出新的词？

2.怎样在处理数据时自动分割大文件?

3.怎样利用JAVA进行抽词?

开始之前，先看一下从人人网中发现的90后用户爱用的词

是不是很好玩，哈哈。写这篇文章就是为了让你能够简单、自动地从文本中找出新的词，这样就知道现在的年轻人喜欢什么了（对于博主这种上了年纪的人来说，真的是很有用，呜呜）

项目结构

当然，text.dat和common.dic这两个文件你可以随意替换，注意text.dat中的数据一定要够份量，否则没啥效果

原理么，看下Matrix67大牛的文章你就懂了

互联网时代的社会语言学：基于SNS的文本数据挖掘

处理数据下载

https://pan.baidu.com/s/1jIoE86q

下边开始上代码

common

这个里边包含以下几个类，主要是定义数据结构

CountMap.java

定义一个计数Map来进行数据操作和持久化

[mw_shl_code=java,true]package grid.common;

import java.io.Serializable;
import java.util.HashMap;

/**
 * A {@link HashMap} specialised for tallying: every key maps to the number of
 * times it has been registered through {@link #increase(Object)}.
 *
 * @param <T> type of the counted keys
 */
public class CountMap<T> extends HashMap<T, Integer> implements Serializable {

    private static final long serialVersionUID = 6097963798841161750L;

    /**
     * Registers one more occurrence of {@code t}.
     *
     * @param t key to count; a key that is not present yet starts at 1
     */
    public void increase(T t) {
        Integer previous = get(t);
        put(t, null == previous ? 1 : previous + 1);
    }

    /**
     * Adds up the tallies of all keys.
     *
     * @return the sum of every value stored in this map
     */
    public int count() {
        int total = 0;
        for (Integer occurrences : values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Looks up the tally of a character key without exposing {@code null}.
     *
     * @param c character to look up (boxed before the lookup)
     * @return the stored tally, or 0 when the character has no entry
     */
    public int get(char c) {
        Integer occurrences = super.get(c);
        if (null == occurrences) {
            return 0;
        }
        return occurrences;
    }
}[/mw_shl_code]

Node.java

定义语法树的节点

[mw_shl_code=java,true]package grid.common;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A node of a simple n-ary tree. Every node carries a value, a reference to
 * its parent and a lazily created list of children.
 *
 * @param <T> type of the value carried by each node
 */
public class Node<T> {
    /** Children in insertion order; stays {@code null} until {@link #add(Object)} is first called. */
    protected List<Node<T>> children;

    /** Node this one was added to; {@code null} for a node that was never attached. */
    protected Node<T> parent;

    /** Payload of this node. */
    protected T value;

    /**
     * Creates a detached node. Package-private: nodes are created through
     * {@link #add(Object)} or through a subclass in this package.
     */
    Node(T value) {
        this.value = value;
    }

    /**
     * Appends a new child holding {@code value}.
     *
     * @param value payload of the new child
     * @return the newly created child node
     */
    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }

    /** @return the payload of this node */
    public T getValue() {
        return value;
    }

    /** @return the parent node, or {@code null} when none was set */
    public Node<T> getParent() {
        return parent;
    }

    /** @param parent node to record as this node's parent */
    public void setParent(Node<T> parent) {
        this.parent = parent;
    }

    /**
     * Depth-first walk that collects every node without a children list.
     *
     * @param leaves accumulator receiving the leaves
     * @param current node whose subtree is being visited
     */
    private void recurseChildren(List<Node<T>> leaves, Node<T> current) {
        if (null == current.children) {
            leaves.add(current);
            return;
        }
        for (Node<T> child : current.children) {
            recurseChildren(leaves, child);
        }
    }

    /**
     * @return all leaves below this node in depth-first order; a node that
     *         has no children returns a list containing only itself
     */
    public List<Node<T>> getLeaves() {
        List<Node<T>> leaves = new ArrayList<Node<T>>();
        recurseChildren(leaves, this);
        return leaves;
    }

    /**
     * Collects the values on the path that ends at this node.
     *
     * The walk goes upwards through the parent links and stops before a
     * parent that is a {@link Tree} (or when there is no parent), so a tree
     * root is not part of the path of its descendants. The node the method is
     * called on is always included, even when it is itself a {@code Tree}.
     *
     * @return the values from the topmost included ancestor down to this node
     */
    public List<T> getBranchPath() {
        List<T> path = new ArrayList<T>();
        Node<T> node = this;
        do {
            path.add(node.getValue());
            node = node.parent;
        } while (null != node && !(node instanceof Tree<?>));
        Collections.reverse(path);
        return path;
    }

    /**
     * Writes one line for {@code node}, indented two spaces per level, then
     * recurses into its children.
     */
    private void append(StringBuilder builder, int deep, Node<T> node) {
        for (int i = 0; i < deep; i++) {
            builder.append("  ");
        }
        builder.append("|--");
        builder.append(node.getValue());
        builder.append("\n");
        if (null != node.children) {
            for (Node<T> child : node.children) {
                append(builder, deep + 1, child);
            }
        }
    }

    /** @return an indented, multi-line rendering of the subtree rooted at this node */
    public String dump() {
        StringBuilder builder = new StringBuilder();
        append(builder, 0, this);
        return builder.toString();
    }

    /**
     * @return the string form of the value; {@code "null"} for a null value,
     *         matching what {@link #dump()} prints
     */
    @Override
    public String toString() {
        // String.valueOf avoids the NullPointerException value.toString() raised for a null payload.
        return String.valueOf(value);
    }
}[/mw_shl_code]

TextDatReader.java

读取处理数据

[mw_shl_code=java,true]package grid.common;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * File helpers for the corpus: reading a UTF-8 text file into memory and
 * splitting a large file into fixed-size pieces under {@code ./dat}.
 *
 * Example: {@code TextDatReader.divide("text.dat", 1024 * 1024 * 4)} cuts
 * {@code text.dat} into 4 MB pieces named {@code ./dat/text1.dat},
 * {@code ./dat/text2.dat}, ...
 */
public class TextDatReader {

    /** File-name suffix of the pieces written by {@link #divide(String, long)}. */
    public static final String SUFFIX = ".dat";

    /** Size of the byte buffer used when copying data into a piece. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Reads a whole file, decoding it as UTF-8.
     *
     * The result buffer is sized by the file's byte length, so the returned
     * string always has that many characters: when the file contains
     * multi-byte characters the decoded text is followed by {@code '\u0000'}
     * padding. This is kept as is because {@code CnDictionary.initWordDic}
     * treats the first NUL as the end of the data.
     *
     * @param path file to read
     * @return the decoded content, NUL-padded up to the file's byte length
     * @throws IOException if the file cannot be opened or read, or is too
     *         large to fit in a single array
     */
    public static String read(String path) throws IOException {
        File file = new File(path);
        long byteLength = file.length();
        if (byteLength > Integer.MAX_VALUE) {
            throw new IOException("File too large to read into memory: " + path);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(file), "utf8"));
        try {
            char[] buffer = new char[(int) byteLength];
            int filled = 0;
            // A single read() is not guaranteed to deliver everything; loop until EOF.
            while (filled < buffer.length) {
                int n = reader.read(buffer, filled, buffer.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            return new String(buffer);
        } finally {
            reader.close();
        }
    }

    /** Creates the {@code ./dat} output directory when nothing exists at that path yet. */
    public static void createDir() {
        File dir = new File("./dat");
        if (!dir.exists()) {
            dir.mkdir();
        }
    }

    /**
     * Splits a file into consecutive pieces of at most {@code size} bytes,
     * written to {@code ./dat/text1.dat}, {@code ./dat/text2.dat}, ...
     *
     * NOTE: pieces are cut at byte offsets, so a multi-byte UTF-8 character
     * that straddles a boundary ends up split between two pieces.
     *
     * @param name path of the file to split
     * @param size maximum piece size in bytes; a value {@code <= 0} means
     *        "half of the file"
     * @throws Exception if {@code name} is not an existing regular file, or
     *         an I/O error occurs while copying
     */
    public static void divide(String name, long size) throws Exception {
        File file = new File(name);
        if (!file.exists() || !file.isFile()) {
            throw new Exception("指定文件不存在！");
        }
        long fileLength = file.length();
        if (size <= 0) {
            // Never let the piece size reach 0: it is used as a divisor below.
            size = Math.max(1, fileLength / 2);
        }
        long pieces = fileLength / size + (fileLength % size != 0 ? 1 : 0);

        createDir();
        FileInputStream in = new FileInputStream(file);
        try {
            byte[] buffer = new byte[COPY_BUFFER_SIZE];
            long copied = 0;
            for (long i = 1; i <= pieces; i++) {
                long pieceLength = Math.min(size, fileLength - copied);
                FileOutputStream out = new FileOutputStream(new File("./dat",
                        "text" + i + SUFFIX));
                try {
                    copy(in, out, buffer, pieceLength);
                } finally {
                    out.close();
                }
                copied += pieceLength;
                System.out.println("第"+i+"个子文件生成……");
            }
        } finally {
            in.close();
        }
    }

    /** Copies up to {@code count} bytes from {@code in} to {@code out}, stopping early at end of input. */
    private static void copy(FileInputStream in, FileOutputStream out,
            byte[] buffer, long count) throws IOException {
        long left = count;
        while (left > 0) {
            int n = in.read(buffer, 0, (int) Math.min(buffer.length, left));
            if (n < 0) {
                break;
            }
            out.write(buffer, 0, n);
            left -= n;
        }
    }
}[/mw_shl_code]

TextUtils.java

用来做文本处理，如判断是否为空、匹配字符等

[mw_shl_code=java,true]package grid.common;

/**
 * Small character and string predicates shared by the indexing, selecting
 * and segmenting code.
 */
public class TextUtils {

    /**
     * @param c character to test
     * @return {@code true} when {@code c} lies in U+4E00..U+9FCB, the range
     *         this project treats as Chinese characters
     */
    public static boolean isCnLetter(char c) {
        return 0x4E00 <= c && c <= 0x9FCB;
    }

    /**
     * @param c character to test
     * @return {@code true} for the ASCII digits '0'..'9'
     */
    public static boolean isNumeric(char c) {
        return '0' <= c && c <= '9';
    }

    /**
     * @param c character to test
     * @return {@code true} for the ASCII letters a-z and A-Z
     */
    public static boolean isEnLetter(char c) {
        boolean lower = 'a' <= c && c <= 'z';
        boolean upper = 'A' <= c && c <= 'Z';
        return lower || upper;
    }

    /**
     * Tests whether {@code dest} occurs in {@code src} starting exactly at
     * index {@code off}.
     *
     * @param src text that is searched
     * @param off index in {@code src} where the comparison starts
     * @param dest text expected at that position
     * @return {@code true} when every character of {@code dest} matches;
     *         {@code false} on the first mismatch or when {@code src} ends
     *         before {@code dest} does; an empty {@code dest} always matches
     */
    public static boolean match(String src, int off, String dest) {
        final int destLen = dest.length();
        final int srcLen = src.length();
        int i = 0;
        while (i < destLen) {
            int at = off + i;
            if (at >= srcLen || src.charAt(at) != dest.charAt(i)) {
                return false;
            }
            i++;
        }
        return true;
    }

    /**
     * @param str string to test, may be {@code null}
     * @return {@code true} when {@code str} is {@code null}, empty, or empty
     *         after {@link String#trim()}
     */
    public static boolean isBlank(String str) {
        return null == str || str.trim().isEmpty();
    }
}[/mw_shl_code]

Tree.java

语法树

[mw_shl_code=java,true]package grid.common;

/**
 * Root marker for a tree of {@link Node}s. It adds no state or behaviour of
 * its own; {@code Node.getBranchPath()} checks {@code instanceof Tree} to
 * know where to stop when walking up the parent links.
 *
 * @param <T> type of the value carried by each node
 */
public class Tree<T> extends Node<T> {

/**
 * Creates a root node.
 *
 * @param value payload stored in the root
 */
public Tree(T value) {
super(value);
}

}[/mw_shl_code]

dic

里边包含CnDictionary类

CnDictionary.java

词典处理

[mw_shl_code=java,true]package grid.text.dic;

import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * Lazily created, process-wide dictionary: the set of known words loaded
 * from {@code common.dic} plus per-character frequencies counted over
 * {@code text.dat}.
 */
public class CnDictionary {

    /** Word list, one entry per line. */
    private static final String COMMON_WORD_DIC_PATH = "common.dic";

    /**
     * This text data is for character statistic. Change to your own if you
     * like.
     */
    private static final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    private static CnDictionary instance;

    private final Set<String> dictionary = new HashSet<String>();

    private final CountMap<Character> letterCountMap = new CountMap<Character>();

    private int totalLetterCount;

    /**
     * Returns the shared instance, loading both resource files on first use.
     *
     * @return the dictionary, or {@code null} when loading failed (the
     *         {@link IOException} is printed and the next call retries)
     */
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                // Kept best-effort: callers that never consult the dictionary can still run.
                e.printStackTrace();
            }
        }
        return instance;
    }

    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    /** Counts every Chinese character of the corpus file and caches the grand total. */
    private void initLetterCountMap() throws IOException {
        String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        for (int i = 0; i < len; i++) {
            char c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    /**
     * Loads the word list: entries are separated by CR/LF, surrounding
     * whitespace is trimmed and blank lines are skipped. Reading stops at the
     * first NUL character, and the entry collected so far is always stored,
     * so the last word is kept even without a trailing line break.
     */
    private void initWordDic() throws IOException {
        String content = TextDatReader.read(COMMON_WORD_DIC_PATH);
        final int len = content.length();
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < len; i++) {
            char c = content.charAt(i);
            if (0 == c) {
                break;
            }
            if ('\n' == c || '\r' == c) {
                addWord(word);
            } else {
                word.append(c);
            }
        }
        // Flush the final entry: it may be terminated by neither a line break nor a NUL.
        addWord(word);
    }

    /** Stores the trimmed content of {@code word} unless it is blank, then clears the builder. */
    private void addWord(StringBuilder word) {
        String trimmed = word.toString().trim();
        if (!trimmed.isEmpty()) {
            dictionary.add(trimmed);
        }
        word.setLength(0);
    }

    /**
     * @param word candidate word
     * @return {@code true} when the word list contains {@code word}
     */
    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    /**
     * @param c character to look up
     * @return share of {@code c} among all counted Chinese characters of the
     *         corpus; 0 when nothing was counted
     */
    public double rate(char c) {
        if (0 == totalLetterCount) {
            return 0;
        }
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    /** @return number of distinct words in the word list */
    public int size() {
        return dictionary.size();
    }
}[/mw_shl_code]

evolution

EntropyJudger.java

计算熵值

[mw_shl_code=java,true]package grid.text.evolution;

import grid.common.CountMap;
import grid.common.TextUtils;
import grid.text.index.Pos;
import grid.text.index.TextIndexer;

/**
 * Decides whether a candidate string behaves like a word in the indexed
 * document, using its internal cohesion ("solid rate") and the entropy of the
 * Chinese characters found directly before and after its occurrences.
 */
public class EntropyJudger {

    private TextIndexer indexer;

    /**
     * A word least appeared count
     */
    private static final int LEAST_COUNT_THRESHOLD = 5;

    /**
     * Threshold for solid rate calculated by word appeared count and every
     * single letter.
     *
     * The smaller this values is, more new words you will get, but with less
     * accuracy. The greater this value is, less new words you will get, but
     * with high accuracy.
     */
    private static final double SOLID_RATE_THRESHOLD = 0.018;

    /**
     * Threshold for entropy value calculated by candidate word prefix character
     * count and suffix character count
     *
     * The smaller this values is, more new words you will get, but with less
     * accuracy. The greater this value is, less new words you will get, but
     * with high accuracy.
     */
    private static final double ENTROPY_THRESHOLD = 1.92;

    /** @param indexer index over the document the candidates come from */
    public EntropyJudger(TextIndexer indexer) {
        this.indexer = indexer;
    }

    /**
     * @param candidate string to evaluate
     * @return {@code true} when neither the solid rate nor the neighbour
     *         entropy falls below its threshold
     */
    public boolean judge(String candidate) {
        double solidRate = getSolidRate(candidate);
        if (solidRate < SOLID_RATE_THRESHOLD) {
            return false;
        }

        double entropy = getEntropy(candidate);
        if (entropy < ENTROPY_THRESHOLD) {
            return false;
        }
        return true;
    }

    /**
     * Tallies the Chinese character directly before and directly after every
     * occurrence of {@code candidate} and returns the smaller of the two
     * resulting entropies. Non-Chinese neighbours are ignored.
     */
    private double getEntropy(String candidate) {
        Pos pos = new Pos(candidate);
        CountMap<Character> frontCountMap = new CountMap<Character>();
        CountMap<Character> backCountMap = new CountMap<Character>();
        final int candidateLen = candidate.length();

        while (indexer.find(pos).isFound()) {
            int off = pos.getPos();

            char before = indexer.charAt(off - 1);
            if (TextUtils.isCnLetter(before)) {
                frontCountMap.increase(before);
            }
            char after = indexer.charAt(off + candidateLen);
            if (TextUtils.isCnLetter(after)) {
                backCountMap.increase(after);
            }
        }

        return Math.min(entropyOf(frontCountMap), entropyOf(backCountMap));
    }

    /** Entropy (natural logarithm) of the distribution described by the tallies; 0 for an empty map. */
    private static double entropyOf(CountMap<Character> neighbours) {
        // Hoisted: the total is the same for every entry.
        final int total = neighbours.count();
        double entropy = 0;
        for (int occurrences : neighbours.values()) {
            double rate = (double) occurrences / total;
            entropy -= rate * Math.log(rate);
        }
        return entropy;
    }

    /**
     * Measures how tightly the characters of {@code candidate} stick
     * together: the geometric mean of count(candidate) / count(character)
     * over its characters, scaled by sqrt(length).
     *
     * @param candidate string to evaluate
     * @return 1 for candidates shorter than two characters, 0 when the
     *         candidate occurs fewer than LEAST_COUNT_THRESHOLD times,
     *         otherwise the scaled geometric mean
     */
    public double getSolidRate(String candidate) {
        final int candidateLen = candidate.length();
        if (candidateLen < 2) {
            return 1;
        }

        final int count = indexer.count(candidate);
        if (count < LEAST_COUNT_THRESHOLD) {
            return 0;
        }

        double rate = 1;
        for (int i = 0; i < candidateLen; i++) {
            rate *= (double) count / indexer.count("" + candidate.charAt(i));
        }
        return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
    }

    /** @param indexer index to use for subsequent judgements */
    public void setIndexer(TextIndexer indexer) {
        this.indexer = indexer;
    }

}[/mw_shl_code]

NewWordDiscover.java

抽词程序

[mw_shl_code=java,true]package grid.text.evolution;

import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;

import java.util.HashSet;
import java.util.Set;

/**
 * Finds strings in a document that look like words but are missing from the
 * dictionary.
 */
public class NewWordDiscover {

    /**
     * Minimum word length
     */
    private final static int MIN_CANDIDATE_LEN = 2;

    /**
     * Maximum word length
     */
    private final static int MAX_CANDIDATE_LEN = 6;

    /** Characters a candidate may neither start nor end with. */
    private static char[] structuralLetters = { '我', '你', '您', '他', '她', '谁',
            '哪', '那', '这', '的', '了', '着', '也', '是', '有', '不', '在', '与', '呢',
            '啊', '呀', '吧', '嗯', '哦', '哈', '呐' };

    private static Set<Character> structuralLetterSet = new HashSet<Character>();

    static {
        for (int i = 0; i < structuralLetters.length; i++) {
            structuralLetterSet.add(structuralLetters[i]);
        }
    }

    private CnDictionary dictionary;

    public NewWordDiscover() {
        dictionary = CnDictionary.Instance();
    }

    /**
     * New word discover is based on statistic and entropy, better to sure
     * document size is in 100kb level, or you may get a unsatisfied result.
     *
     * @param document text to mine
     * @return the candidates accepted by the {@link EntropyJudger} that are
     *         not dictionary words
     */
    public Set<String> discover(String document) {
        TextIndexer indexer = new CnPreviewTextIndexer(document);
        TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN,
                MAX_CANDIDATE_LEN);
        EntropyJudger judger = new EntropyJudger(indexer);
        Set<String> discovered = new HashSet<String>();

        while (!selector.end()) {
            String candidate = selector.next();
            if (TextUtils.isBlank(candidate) || hasStructuralEdge(candidate)) {
                continue;
            }
            // Replace IF clause with "discovered.contains(candidate)" if you
            // want to find new word without any dictionary
            boolean known = dictionary.contains(candidate)
                    || discovered.contains(candidate);
            if (known) {
                selector.select();
            } else if (judger.judge(candidate)) {
                discovered.add(candidate);
            }
        }
        return discovered;
    }

    /** @return {@code true} when the first or the last character is a structural letter */
    private static boolean hasStructuralEdge(String candidate) {
        char first = candidate.charAt(0);
        if (structuralLetterSet.contains(first)) {
            return true;
        }
        char last = candidate.charAt(candidate.length() - 1);
        return structuralLetterSet.contains(last);
    }
}[/mw_shl_code]

index

这几个类用于给待处理文本中的字符建立位置索引，方便快速统计候选词的出现次数、查找它在文本中的出现位置

CnPreviewTextIndexer.java

[mw_shl_code=java,true]package grid.text.index;

import grid.common.TextUtils;

import java.util.HashMap;
import java.util.Map;
import java.util.Vector;

/**
 * {@link TextIndexer} that records, for every Chinese character of the
 * document, the offsets at which it occurs. Lookups then only test the
 * offsets of the first character of the searched text. Text that does not
 * start with a Chinese character is never found.
 */
public class CnPreviewTextIndexer implements TextIndexer {

    /**
     * Sizing hint for the position map and the per-character vectors
     * (presumably an estimate of the number of distinct Chinese characters).
     */
    private final static int CN_LETTER_COUNT = 5021;

    private String document;

    /** Chinese character -> ascending offsets of its occurrences in the document. */
    private Map<Character, Vector<Integer>> posMap;

    /** @param document text to index */
    public CnPreviewTextIndexer(String document) {
        this.document = document;
        init();
    }

    /** Builds the position map in one pass over the document. */
    private void init() {
        final int len = document.length();
        final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT
                + 1);

        posMap = new HashMap<Character, Vector<Integer>>(CN_LETTER_COUNT);

        for (int i = 0; i < len; i++) {
            char c = document.charAt(i);
            if (!TextUtils.isCnLetter(c)) {
                continue;
            }
            Vector<Integer> positions = posMap.get(c);
            if (null == positions) {
                positions = new Vector<Integer>(supposedMinCount);
                posMap.put(c, positions);
            }
            positions.add(i);
        }
    }

    /**
     * @param text text to count
     * @return number of offsets at which {@code text} matches (overlapping
     *         occurrences are all counted); 0 for blank text or text whose
     *         first character is not indexed
     */
    @Override
    public int count(String text) {
        if (TextUtils.isBlank(text)) {
            return 0;
        }

        Vector<Integer> positions = posMap.get(text.charAt(0));
        if (null == positions) {
            return 0;
        }

        if (1 == text.length()) {
            return positions.size();
        }

        int count = 0;
        for (Integer start : positions) {
            if (TextUtils.match(document, start, text)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Advances {@code pos} to the next occurrence of its target, resuming
     * after the position-array index stored in {@code pos}.
     *
     * @param pos cursor holding the target and the previous match
     * @return the same {@code pos}; its found flag tells whether another
     *         occurrence exists
     */
    @Override
    public Pos find(Pos pos) {
        String text = pos.getTarget();
        pos.setFound(false);

        if (TextUtils.isBlank(text)) {
            return pos;
        }

        Vector<Integer> positions = posMap.get(text.charAt(0));
        if (null == positions) {
            return pos;
        }

        final int size = positions.size();
        for (int i = pos.arrayIndex + 1; i < size; i++) {
            int start = positions.get(i);
            if (TextUtils.match(document, start, text)) {
                pos.setFound(true);
                pos.setPos(start);
                pos.arrayIndex = i;
                break;
            }
        }
        return pos;
    }

    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the requested substring, or {@code ""} when the range is
     *         negative or reaches beyond the end of the document
     */
    @Override
    public String sub(int off, int len) {
        // ">" rather than ">=": a range ending exactly at the document end is valid.
        if (off < 0 || len < 0 || off + len > document.length()) {
            return "";
        }
        return document.substring(off, off + len);
    }

    /**
     * @param index offset in the document
     * @return the character at {@code index}, or the NUL character when
     *         {@code index} is out of range
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}[/mw_shl_code]

Pos.java

[mw_shl_code=java,true]package grid.text.index;

/**
 * Search cursor handed to {@code TextIndexer.find}: it names the text that is
 * searched for and remembers where the most recent match was found.
 */
public class Pos {

    /**
     * Index in position array for current matched full target text
     */
    int arrayIndex = -1;

    /**
     * Pos for current matched full target text
     */
    private int pos = -1;

    private boolean found = false;

    private String target;

    /** @param target text to search for */
    public Pos(String target) {
        this.target = target;
    }

    /** @return the text this cursor searches for */
    public String getTarget() {
        return this.target;
    }

    /** @return offset of the most recent match, -1 before the first match */
    public int getPos() {
        return this.pos;
    }

    /** @return whether the most recent search step found a match */
    public boolean isFound() {
        return this.found;
    }

    /** Records the offset of a match; only called by indexers in this package. */
    void setPos(int pos) {
        this.pos = pos;
    }

    /** Records the outcome of a search step; only called by indexers in this package. */
    void setFound(boolean found) {
        this.found = found;
    }
}[/mw_shl_code]

SimpleTextIndexer.java

[mw_shl_code=java,true]package grid.text.index;

/**
 * {@link TextIndexer} without any precomputed index: every query scans the
 * document with {@link String#indexOf(String, int)}. Occurrences are searched
 * from the end of the previous one, so overlapping occurrences are not
 * reported.
 */
public class SimpleTextIndexer implements TextIndexer {

    private String document;

    /** @param document text to search */
    public SimpleTextIndexer(String document) {
        this.document = document;
    }

    /**
     * @param text text to count
     * @return number of non-overlapping occurrences; 0 for {@code null} or
     *         empty text
     */
    @Override
    public int count(String text) {
        // indexOf("") matches at every offset without advancing, so an empty
        // needle would loop forever.
        if (null == text || text.isEmpty()) {
            return 0;
        }
        final int len = text.length();
        int count = 0;
        int off = document.indexOf(text);
        while (off > -1) {
            count++;
            off = document.indexOf(text, off + len);
        }
        return count;
    }

    /**
     * Advances {@code pos} to the next occurrence of its target, searching
     * from the end of the previous match (or from the document start when
     * nothing was matched yet).
     *
     * @param pos cursor holding the target and the previous match
     * @return the same {@code pos}; its found flag tells whether another
     *         occurrence exists (never for a {@code null} or empty target)
     */
    @Override
    public Pos find(Pos pos) {
        final String text = pos.getTarget();
        pos.setFound(false);

        // An empty target would be "found" at the same offset on every call.
        if (null == text || text.isEmpty()) {
            return pos;
        }

        int from = pos.getPos() < 0 ? 0 : pos.getPos() + text.length();
        int hit = document.indexOf(text, from);
        if (hit > -1) {
            pos.setFound(true);
            pos.setPos(hit);
        }
        return pos;
    }

    @Override
    public int len() {
        return document.length();
    }

    /**
     * @param off start offset
     * @param len number of characters
     * @return the requested substring
     * @throws StringIndexOutOfBoundsException when the range does not lie
     *         inside the document
     */
    @Override
    public String sub(int off, int len) {
        return document.substring(off, off + len);
    }

    /**
     * @param index offset in the document
     * @return the character at {@code index}, or the NUL character when
     *         {@code index} is out of range
     */
    @Override
    public char charAt(int index) {
        if (index < 0 || index >= document.length()) {
            return 0;
        }
        return document.charAt(index);
    }
}[/mw_shl_code]

TextIndexer.java

[mw_shl_code=java,true]package grid.text.index;

/**
 * Read-only view of a document that can count and locate occurrences of a
 * text. Implemented in this package by {@code CnPreviewTextIndexer} and
 * {@code SimpleTextIndexer}.
 */
public interface TextIndexer {

/**
   * Counts the occurrences of a text in the document. Whether overlapping
   * occurrences are counted depends on the implementation.
   *
   * @param text text to look for
   * @return count for specific text
   */
public int count(String text);

/**
   * Moves a search cursor to the next occurrence of its target.
   *
   * @param pos cursor carrying the target text and the previous match
   * @return next position for current pos; the implementations in this
   *         package update and return the object that was passed in
   */
public Pos find(Pos pos);

/**
   * @return original document length
   */
public int len();

/**
   * @param off start offset of the substring
   * @param len number of characters in the substring
   * @return the sub string start from <b>off</b> and with a length with
   *       <b>len</b>; handling of out-of-range arguments is
   *       implementation specific
   */
public String sub(int off, int len);

/**
   * @param index offset in the document
   * @return return the character in the specified index; the
   *         implementations in this package return the NUL character for
   *         an index outside the document
   */
public char charAt(int index);
}[/mw_shl_code]

participle

分词处理，具体看实现

Chunk.java

[mw_shl_code=java,true]package grid.text.participle;

import grid.text.dic.CnDictionary;

import java.util.List;

/**
 * One candidate segmentation (a short sequence of consecutive words) together
 * with the statistics used to rank it against competing candidates. Sorting
 * puts the preferred chunk first:
 * <ol>
 * <li>greater total length,</li>
 * <li>greater average word length,</li>
 * <li>smaller spread of the word lengths,</li>
 * <li>greater summed corpus frequency of the single-character words.</li>
 * </ol>
 */
public class Chunk implements Comparable<Chunk> {

    /** Differences below this value are treated as equality. */
    private static final double EPSILON = 0.0000001D;

    private List<String> list;

    private int len = 0;

    private double avg = 0;

    private double variance = 0;

    /** @param list the words of this chunk, in text order */
    public Chunk(List<String> list) {
        this.list = list;
        init();
    }

    /** Computes the total length, the average word length and the spread of the word lengths. */
    private void init() {
        for (String s : list) {
            len += s.length();
        }
        avg = (double) len / list.size();

        double squares = 0;
        for (String s : list) {
            squares += Math.pow(avg - s.length(), 2);
        }
        variance = Math.sqrt(squares);
    }

    /** @return total number of characters covered by the chunk */
    public int getLen() {
        return len;
    }

    /** @return average word length */
    public double getAvg() {
        return avg;
    }

    /**
     * @return square root of the summed squared deviations of the word
     *         lengths from the average (not divided by the word count)
     */
    public double getVariance() {
        return variance;
    }

    /** @return the first word of the chunk, or {@code ""} when there is none */
    public String getHead() {
        if (null == list || list.isEmpty()) {
            return "";
        }
        return list.get(0);
    }

    /**
     * Compares so that the greater value sorts first.
     *
     * @return -1 when {@code d1} is greater, 1 when it is smaller, 0 when the
     *         two are within EPSILON of each other
     */
    private int compareDouble(double d1, double d2) {
        if (d1 - d2 < -EPSILON) {
            return 1;
        } else if (d1 - d2 > EPSILON) {
            return -1;
        }
        return 0;
    }

    @Override
    public int compareTo(Chunk o) {
        if (len != o.len) {
            return len > o.len ? -1 : 1;
        }

        int d = compareDouble(avg, o.avg);
        if (0 != d) {
            return d;
        }

        // Arguments swapped on purpose: the chunk with the SMALLER spread must
        // sort first (evenly sized words are preferred), whereas
        // compareDouble puts the greater first argument first.
        d = compareDouble(o.variance, variance);
        if (0 != d) {
            return d;
        }

        return compareDouble(singleLetterRate(list), singleLetterRate(o.list));
    }

    /** Sums the corpus frequency of every one-character word in {@code words}. */
    private static double singleLetterRate(List<String> words) {
        CnDictionary dictionary = CnDictionary.Instance();
        double rate = 0;
        for (String s : words) {
            if (1 == s.length()) {
                rate += dictionary.rate(s.charAt(0));
            }
        }
        return rate;
    }

    @Override
    public String toString() {
        return list.toString();
    }
}[/mw_shl_code]

ChunkStream.java

[mw_shl_code=java,true]package grid.text.participle;

import grid.common.Node;
import grid.common.TextUtils;
import grid.common.Tree;
import grid.text.dic.CnDictionary;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Picks the next word of a text by building every candidate chain of up to
 * PREDICT_LEVEL words that starts at the given offset and keeping the first
 * word of the best-ranked {@link Chunk}.
 */
public class ChunkStream {

    /**
     * Define the max supposed word length
     *
     * You could shorten the value if you don't need too long participle result
     */
    private static final int MAX_WORD_LEN = 7;

    /**
     * Define the predict level while execute participle.
     *
     * Negligible accuracy will be promoted if you increase this value
     */
    private static final int PREDICT_LEVEL = 3;

    private static CnDictionary dictionary = CnDictionary.Instance();

    /**
     * @param text text being segmented
     * @param off offset at which the next word starts
     * @return the first word of the best candidate chunk, or {@code ""} when
     *         no word can start at {@code off}
     */
    public String next(String text, int off) {
        Tree<String> root = new Tree<String>("ROOT");
        recurse(root, off, text, 0);

        List<Node<String>> leaves = root.getLeaves();
        // A childless root is its own only leaf: nothing was recognised, so do
        // not hand the "ROOT" sentinel out as if it were a word.
        if (1 == leaves.size() && leaves.get(0) == root) {
            return "";
        }

        List<Chunk> chunkList = new ArrayList<Chunk>(leaves.size());
        for (Node<String> leaf : leaves) {
            chunkList.add(new Chunk(leaf.getBranchPath()));
        }
        Collections.sort(chunkList);
        return chunkList.get(0).getHead();
    }

    /**
     * Adds to {@code node} one child per word that can start at {@code off},
     * trying lengths from the longest down to a single character, and
     * continues behind each word until PREDICT_LEVEL words are chained.
     * Strings of two or more characters must be dictionary words; a single
     * character is accepted only when it is a Chinese character.
     */
    private void recurse(Node<String> node, int off, String text,
            int predictDeep) {
        if (predictDeep >= PREDICT_LEVEL) {
            return;
        }
        int longest = MAX_WORD_LEN + off > text.length() ? text.length() - off
                : MAX_WORD_LEN;

        for (int len = longest; len >= 1; len--) {
            String s = text.substring(off, off + len);
            if (len < 2) {
                if (!TextUtils.isCnLetter(text.charAt(off))) {
                    break;
                }
                recurse(node.add(s), off + 1, text, predictDeep + 1);
            } else if (dictionary.contains(s)) {
                recurse(node.add(s), off + s.length(), text, predictDeep + 1);
            }
        }
    }
}[/mw_shl_code]

MechanicalParticiple.java

[mw_shl_code=java,true]package grid.text.participle;

import grid.common.TextUtils;

import java.util.Vector;

/**
 * Dictionary-driven segmenter: runs of ASCII letters and digits become one
 * token each, Chinese text is cut into words by {@link ChunkStream}, and
 * every other character only acts as a separator.
 */
public class MechanicalParticiple {

    /**
     * @param document text to segment
     * @return the tokens in document order
     */
    public Vector<String> partition(String document) {
        Vector<String> tokens = new Vector<String>();
        StringBuilder seg = new StringBuilder();
        ChunkStream stream = new ChunkStream();
        final int docLen = document.length();
        int off = 0;

        while (off < docLen) {
            char c = document.charAt(off);
            if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                seg.append(c);
                off++;
                continue;
            }

            // Any other character ends the pending ASCII token.
            flush(tokens, seg);

            if (TextUtils.isCnLetter(c)) {
                String word = stream.next(document, off);
                if (TextUtils.isBlank(word)) {
                    // Always make progress, even if no word was produced.
                    off++;
                } else {
                    tokens.add(word);
                    off += word.length();
                }
            } else {
                /**
                 * TODO: add the character itself as a token here (when it is
                 * not blank) if you would like to reserve punctuations
                 */
                off++;
            }
        }
        flush(tokens, seg);
        return tokens;
    }

    /** Moves the pending ASCII token, if any, into {@code tokens} and clears the buffer. */
    private static void flush(Vector<String> tokens, StringBuilder seg) {
        if (seg.length() > 0) {
            tokens.add(seg.toString());
            seg.setLength(0);
        }
    }
}[/mw_shl_code]

selector

文本选择器，筛选出可能为新词的词汇

CnTextSelector.java

[mw_shl_code=java,true]package grid.text.selector;

import grid.common.TextUtils;

/**
 * {@link CommonTextSelector} whose windows only ever cover Chinese
 * characters: non-Chinese characters are skipped and a window never extends
 * across one.
 */
public class CnTextSelector extends CommonTextSelector {

    public CnTextSelector(String document, int minSelectLen, int maxSelectLen) {
        super(document, minSelectLen, maxSelectLen);
    }

    /**
     * Moves {@code pos} forward to the next Chinese character and sets
     * {@code curLen} to the length of the run of Chinese characters starting
     * there, capped at {@code maxSelectLen}. A run that is cut short by a
     * non-Chinese character and is shorter than {@code minSelectLen} is
     * skipped one character at a time.
     *
     * Implemented as a loop: the recursive form needed one stack frame per
     * skipped character and could overflow the stack on long stretches of
     * short runs.
     */
    @Override
    protected void adjustCurLen() {
        while (true) {
            while (pos < docLen && !TextUtils.isCnLetter(document.charAt(pos))) {
                pos++;
            }

            int run = 0;
            while (run < maxSelectLen && pos + run < docLen
                    && TextUtils.isCnLetter(document.charAt(pos + run))) {
                run++;
            }

            boolean cutByNonCnLetter = run < maxSelectLen && pos + run < docLen;
            if (!cutByNonCnLetter) {
                // Full-size window, or the document ends inside the window.
                curLen = pos + maxSelectLen > docLen ? docLen - pos
                        : maxSelectLen;
                return;
            }

            curLen = run;
            if (curLen >= minSelectLen) {
                return;
            }
            // Run too short to yield a candidate: step over one character and retry.
            pos++;
        }
    }
}[/mw_shl_code]

CommonTextSelector.java

[mw_shl_code=java,true]package grid.text.selector;

/**
 * Sliding-window candidate generator. For the current start offset it hands
 * out substrings from the longest allowed length down to the shortest, then
 * moves the start forward by one character and repeats.
 */
public class CommonTextSelector implements TextSelector {

// Text the candidates are cut from.
protected String document;

// Start offset of the current window.
protected int pos = 0;

// Longest candidate length; overwritten by the constructor argument.
protected int maxSelectLen = 5;

// Shortest candidate length; overwritten by the constructor argument.
protected int minSelectLen = 2;

// Length of the candidate the next call to next() will try to return.
protected int curLen;

// Cached document.length().
protected final int docLen;

/**
 * @param document text to cut candidates from
 * @param minSelectLen shortest candidate length
 * @param maxSelectLen longest candidate length
 */
public CommonTextSelector(String document, int minSelectLen,
         int maxSelectLen) {
      this.document = document;
      this.minSelectLen = minSelectLen;
      this.maxSelectLen = maxSelectLen;
      docLen = document.length();
      // Overridable call: subclasses such as CnTextSelector replace this
      // method, and every field it reads has been assigned above.
      adjustCurLen();
}

/**
 * Jumps past the candidate most recently returned by next() and starts a
 * fresh window there.
 */
public void select() {
      // next() post-decrements curLen after cutting a candidate, so ++curLen
      // restores the length of that candidate before pos is advanced by it.
      pos += ++curLen;
      adjustCurLen();
}

/**
 * Resets curLen for a new window: maxSelectLen, or the number of remaining
 * characters (docLen - pos) when fewer than maxSelectLen are left.
 */
protected void adjustCurLen() {
      curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen;
}

/**
 * @return the next candidate, or "" when the current window cannot supply
 *         one
 */
public String next() {
      // All lengths for this start offset are used up: move one character on.
      if (curLen < minSelectLen) {
         pos++;
         adjustCurLen();
      }

      if (pos + curLen <= docLen && curLen >= minSelectLen) {
         // The post-decrement makes the following call return the candidate
         // that is one character shorter, from the same start offset.
         return document.substring(pos, pos + curLen--);
      } else {
         curLen--;
         // No candidate of a valid length fits here.
         return "";
      }
}

/**
 * @return true when curLen has dropped below minSelectLen and
 *         curLen + pos has reached docLen - 1 or beyond
 */
public boolean end() {
      return curLen < minSelectLen && curLen + pos >= docLen - 1;
}

/** @return start offset of the current window */
@Override
public int getCurPos() {
      return pos;
}
}[/mw_shl_code]

TextSelector.java

[mw_shl_code=java,true]package grid.text.selector;

/**
 * Iterator-like source of candidate strings cut from a document. Typical
 * use, as in NewWordDiscover: loop while end() is false, fetch a candidate
 * with next(), and call select() to skip past a candidate that was accepted.
 */
public interface TextSelector {
/** @return true when no further candidates are available */
public boolean end();

/**
 * Accepts the candidate most recently returned by next(); in
 * CommonTextSelector this moves the start offset past that candidate.
 */
public void select();

/**
 * @return the next candidate; CommonTextSelector returns "" when the
 *         current position cannot supply one
 */
public String next();

/** @return the current start offset within the document */
public int getCurPos();

}[/mw_shl_code]

测试代码

NewWordDiscoverTest.java

[mw_shl_code=java,true]package grid.test;

import grid.common.TextDatReader;
import grid.text.evolution.NewWordDiscover;
import grid.text.index.CnPreviewTextIndexer;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Scanner;
import java.util.Set;

/**
 * Console driver: asks for a text file, splits it when it is 1 GB or larger,
 * runs the new-word discovery on the file (or on each piece) and appends the
 * discovered words with their occurrence counts to {@code result.txt}.
 */
public class NewWordDiscoverTest {

    /** Files of at least this size (1 GB) are processed piece by piece. */
    private static final long MAX_SIZE = 1024 * 1024 * 1024;

    /** Size of one piece when splitting: 5 MB. */
    private static final long PIECE_SIZE = 1024 * 1024 * 5;

    /**
     * Appends {@code m} to {@code result.txt} in the working directory,
     * creating the file when needed. I/O errors are printed, not propagated.
     *
     * @param m text to append
     */
    public static void writefile(String m) {
        try {
            File file = new File("result.txt");
            if (!file.exists()) {
                file.createNewFile();
            }
            BufferedWriter writer = new BufferedWriter(new FileWriter(
                    file.getName(), true));
            try {
                writer.write(m);
            } finally {
                // Close even when the write fails, so the handle is not leaked.
                writer.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @SuppressWarnings("resource")
    public static void main(String[] args) throws Exception {
        // Start from an empty result.txt so results of earlier runs are not mixed in.
        new File("result.txt").delete();

        Scanner scan = new Scanner(System.in);
        System.out.println("请输入您要处理的文件名称:\n");
        String path = scan.next();
        File file = new File(path);
        if (!file.exists() || (!file.isFile())) {
            throw new Exception("指定文件不存在！");
        }

        long fileLength = file.length();
        if (fileLength < MAX_SIZE) {
            System.out.println("开始提取文件……");
            discoverWords(path);
            return;
        }

        System.out.println("文件大小超出1G，是否开始进行文件切割？1:是 0:否\n");
        if (scan.nextInt() == 1) {
            TextDatReader.divide(path, PIECE_SIZE);
            System.out.println("切割完成\n");
            System.out.println("结果保存在当前目录下的dat文件夹中\n");
        }

        // The pieces are processed whatever the answer was. NOTE(review):
        // after answering 0 this only works if ./dat already holds pieces
        // from an earlier run -- confirm that this is intended.
        long pieces = fileLength / PIECE_SIZE
                + (fileLength % PIECE_SIZE != 0 ? 1 : 0);
        for (long m = 1; m <= pieces; m++) {
            String piecePath = "./dat/text" + m + ".dat";
            System.out.println("开始提取第" + m + "个文件……");
            discoverWords(piecePath);
        }
    }

    /**
     * Runs the discovery on one file, prints every new word with its number
     * of occurrences and appends the same lines to {@code result.txt}.
     */
    private static void discoverWords(String path) throws IOException {
        String document = TextDatReader.read(path);
        NewWordDiscover discover = new NewWordDiscover();
        Set<String> words = discover.discover(document);
        CnPreviewTextIndexer indexer = new CnPreviewTextIndexer(document);

        System.out.println("新词个数: " + words.size());
        System.out.println("发现的新词:" + "\n");

        StringBuilder report = new StringBuilder();
        for (String word : words) {
            // Count once per word and reuse the line for console and file.
            String line = word + "," + indexer.count(word) + "\n";
            System.out.println(line);
            report.append(line);
        }
        // One append per file instead of reopening result.txt for every word.
        if (report.length() > 0) {
            writefile(report.toString());
        }
    }
}[/mw_shl_code]

抽词测试，结果如下

ParticipleTest.java

[mw_shl_code=java,true]package grid.test;

import grid.text.participle.MechanicalParticiple;

import java.util.Vector;

/**
 * Smallest possible demo of the segmenter: cuts one fixed sentence into
 * words and prints the resulting list.
 */
public class ParticipleTest {

    /** Sentence that is segmented by {@link #main(String[])}. */
    private static String document = "我是中国人";

    public static void main(String[] args) {
        Vector<String> words = new MechanicalParticiple().partition(document);
        System.out.println(words);
    }
}[/mw_shl_code]

分词测试，结果如下

怎么样，很酷吧，你还可以试着用《天龙八部》数据集玩下，看看主角是不是乔帮主。如果发现了什么新鲜词，请告诉博主，咱也不落后哈！

VIP独享–天龙八部新词，如果想看结果请心里默夸博主一百次

执行以上步骤后再送您一份哈利波特版的

来源：csdn

作者：浙里同心