词频统计小程序-WordCount.exe
01
—
先看效果
双击运行,下拉框选择源文件来源,支持本地和网络资源,如图:
本地源文件
网络源文件
02
—
主要代码
1.pom文件
<dependencies>
<!-- Word segmenter (IK Analyzer) -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- Unit testing (test scope only) -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<!-- jsoup HTML parser: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.18.1</version>
<configuration>
<!-- Tests are compiled but not executed during the build -->
<skipTests>true</skipTests>
</configuration>
</plugin>
<!-- Packaging plugin: assembles a single jar that bundles the dependencies -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.4.1</version>
<configuration>
<!-- Use the predefined descriptor that pulls in all project dependencies -->
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<!-- Declare the main class in the manifest so the jar is executable -->
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>cn.dintalk.service.WordCount</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<!-- Bind the assembly to the package phase -->
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
2.WebUtils
/**
 * HTTP helpers: fetch a page with a GET request and reduce HTML to plain text.
 *
 * @author Mr.song
 * @date 2019/10/13 9:26
 */
public class WebUtils {
    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
            + " AppleWebKit/537.36 (KHTML, like Gecko)"
            + " Chrome/72.0.3626.119 Safari/537.36";

    /** Maximum time to wait for the connection to be established. */
    private static final int CONNECT_TIMEOUT_MILLIS = 15_000;

    /** Maximum time to wait for data once connected; prevents hanging on a stalled server. */
    private static final int READ_TIMEOUT_MILLIS = 30_000;

    /**
     * Sends a GET request to the given URL and returns the response body.
     *
     * <p>This is a best-effort call: any failure (malformed URL, connection error,
     * non-HTTP URL, timeout, ...) is printed to stderr and an empty or partial
     * string is returned instead of throwing.
     *
     * @param url   the target URL, without the query string
     * @param param the query string to append after {@code ?}; {@code null} to append nothing
     * @return the response body decoded as UTF-8, each line prefixed with {@code '\n'};
     *         an empty string if the request could not be made
     */
    public static String sendGet(String url, String param) {
        String target = (param == null) ? url : url + "?" + param;
        try {
            HttpURLConnection conn = getHttpURLConnection(new URL(target));
            return getResponse(conn);
        } catch (Exception e) {
            // Boundary catch: callers expect "" on any failure, never an exception.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Opens a connection for the URL and applies the common request headers and timeouts.
     *
     * @param realUrl the URL to connect to
     * @return the configured, not yet connected, HTTP connection (never {@code null})
     * @throws IOException if the connection cannot be opened
     */
    private static HttpURLConnection getHttpURLConnection(URL realUrl) throws IOException {
        // Propagate the failure instead of returning null, so the caller never
        // dereferences a missing connection.
        HttpURLConnection conn = (HttpURLConnection) realUrl.openConnection();
        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        conn.setReadTimeout(READ_TIMEOUT_MILLIS);
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent", USER_AGENT);
        return conn;
    }

    /**
     * Reads the whole response body of the connection as UTF-8 text and disconnects.
     *
     * @param conn the connection to read from
     * @return the body, each line prefixed with {@code '\n'}; whatever was read so far
     *         (possibly empty) if an I/O error occurs
     */
    private static String getResponse(HttpURLConnection conn) {
        // StringBuilder avoids the quadratic cost of repeated String concatenation.
        StringBuilder body = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new InputStreamReader(
                conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                body.append('\n').append(line);
            }
        } catch (IOException e) {
            System.out.println("Err:getResponse()");
            e.printStackTrace();
        } finally {
            conn.disconnect();
        }
        return body.toString();
    }

    /**
     * Parses an HTML string and returns its text content.
     *
     * @param html the HTML markup to parse
     * @return the text of the parsed document as produced by jsoup's {@code Document.text()}
     */
    public static String parseHtmlToText(String html) {
        Document document = Jsoup.parse(html);
        return document.text();
    }
}
3.IKSUtils
/**
 * Word segmentation and word-frequency helpers built on the IK segmenter.
 *
 * @author Mr.song
 * @date 2019/10/10 21:12
 */
public class IKSUtils {
    /**
     * Splits the given text into words.
     *
     * @param text the text to segment
     * @return the words in the order the segmenter produced them
     * @throws Exception if the segmenter fails while reading the text
     */
    public static List<String> getStringList(String text) throws Exception{
        // Uses the segmenter directly, without going through Lucene.
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true);
        List<String> words = new ArrayList<>();
        for (Lexeme token = segmenter.next(); token != null; token = segmenter.next()) {
            words.add(token.getLexemeText());
        }
        return words;
    }

    /**
     * Counts how often each word occurs.
     *
     * @param wordList the words to count; may be {@code null}
     * @return a map from word to occurrence count whose iteration order is by
     *         descending count, or {@code null} if {@code wordList} is {@code null}
     */
    public static Map<String,Integer> wordCount(List<String> wordList){
        if (wordList == null) {
            return null;
        }

        Map<String,Integer> tally = new HashMap<>();
        for (String word : wordList) {
            tally.merge(word, 1, Integer::sum);
        }

        // Re-collect into a LinkedHashMap so the descending-count order is kept.
        return tally
                .entrySet()
                .stream()
                .sorted(Collections.reverseOrder(Map.Entry.comparingByValue()))
                .collect(Collectors.toMap(
                        Map.Entry::getKey,
                        Map.Entry::getValue,
                        (first, second) -> second,
                        LinkedHashMap::new));
    }
}
03
—
相关链接
源码地址:https://github.com/MrSonghui/wordCount
打包为 .exe:https://www.cnblogs.com/xiaoMzjm/p/3879766.html
赞 (0)