思路:
1、使用流将文件读到内存
2、转化为字符数组,以标点符号和空格为分隔符切分出单词,并添加到List集合
3、使用HashMap对List集合进行元素出现频率的统计
/**
 * Entry point: reads a text file, splits it into words and prints how often
 * each word occurs.
 *
 * @param args optional; when present, {@code args[0]} is the path of the file
 *             to analyse, otherwise the built-in default path is used
 */
public static void main(String[] args) {
    // For a quick experiment without a file, replace the call below with a literal such as
    // "I am a A programer,programer !".
    String path = (args != null && args.length > 0)
            ? args[0]
            : "C:\\Users\\Sony\\Desktop\\英文文章.txt";
    String sourceStr = Utils.getFileText(path);
    List<String> resList = Utils.getElements(sourceStr);
    Map<String,Integer> resMap = Utils.countFrequence(resList);
    if (resMap == null) {
        // countFrequence returns null when it is given a null or empty list
        // (e.g. the file is missing, empty, or contains no words).
        System.out.println("resMap.size():0");
        return;
    }
    System.out.println("resMap.size():"+resMap.size());
    for (Map.Entry<String,Integer> entry : resMap.entrySet()) {
        System.out.println(entry.getKey()+":" + entry.getValue());
    }
}
/**
 * Reads a text file line by line and returns its content as a single string.
 * <p>
 * Line terminators are not preserved: consecutive lines are joined with one
 * space so that the last word of a line and the first word of the next line
 * stay separate words. The file is decoded with the platform default charset.
 * <p>
 * Failures are reported with {@code printStackTrace()} and do not propagate;
 * whatever was read before the failure (possibly the empty string) is returned.
 *
 * @author falcon
 * @date 2019-05-11 22:59:30
 * @param path path of the file to read
 * @return the file content with lines joined by a single space, or {@code ""}
 *         when {@code path} is blank or the file cannot be opened
 */
public static String getFileText (@NotNull String path){
    if (StringUtils.isBlank(path)) {
        return "";
    }
    // StringBuilder avoids the quadratic cost of String += inside the loop.
    StringBuilder text = new StringBuilder();
    // Scanner streams the file, which also works for large inputs; for small files,
    // reading all bytes at once would be a simpler alternative.
    try (InputStream is = new FileInputStream(path);
         Scanner scan = new Scanner(is)) {
        boolean firstLine = true;
        while (scan.hasNextLine()) {
            if (!firstLine) {
                text.append(' ');
            }
            text.append(scan.nextLine());
            firstLine = false;
        }
        // Scanner swallows read errors; surface them through the handler below.
        IOException readError = scan.ioException();
        if (readError != null) {
            throw readError;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return text.toString();
}
/**
 * Splits a string into words, using every character that is not a letter or
 * digit (whitespace, line breaks, ASCII and non-ASCII punctuation, symbols)
 * as a delimiter.
 * <p>
 * A word is a maximal run of characters for which
 * {@link Character#isLetterOrDigit(char)} is true. Delimiters are never part
 * of a word and empty words are not produced.
 *
 * @author falcon
 * @date 2019-05-12 11:56:02
 * @param outStr the text to split; may be {@code null}
 * @return the words in order of appearance; an empty list when the text holds
 *         only delimiters; {@code null} when the text is {@code null}, empty
 *         or consists solely of whitespace
 */
public static List<String> getElements(String outStr){
    if (outStr == null) {
        return null;
    }
    List<String> words = new ArrayList<String>();
    boolean onlyWhitespace = true;
    int wordStart = -1; // index of the first character of the current word, -1 when between words
    int length = outStr.length();
    // Scan the original string directly so that substring indices always refer
    // to the same text that is being inspected.
    for (int i = 0; i < length; i++) {
        char c = outStr.charAt(i);
        if (!Character.isWhitespace(c)) {
            onlyWhitespace = false;
        }
        if (Character.isLetterOrDigit(c)) {
            if (wordStart < 0) {
                wordStart = i;
            }
        } else if (wordStart >= 0) {
            words.add(outStr.substring(wordStart, i));
            wordStart = -1;
        }
    }
    if (wordStart >= 0) {
        words.add(outStr.substring(wordStart));
    }
    // Blank input yields null (not an empty list), as callers may rely on it.
    return onlyWhitespace ? null : words;
}
/**
 * Counts how many times each element occurs in a list.
 *
 * @author falcon
 * @date 2019-05-10 13:14:36
 * @param sourceList the elements to count; may be {@code null}
 * @return a map from each distinct element to its number of occurrences, or
 *         {@code null} when {@code sourceList} is {@code null} or empty
 */
public static Map<String,Integer> countFrequence(List<String> sourceList) {
    if (sourceList == null || sourceList.isEmpty()) {
        return null;
    }
    Map<String,Integer> frequencies = new HashMap<>();
    for (String element : sourceList) {
        Integer seenSoFar = frequencies.get(element);
        if (seenSoFar == null) {
            // First occurrence of this element.
            frequencies.put(element, 1);
        } else {
            frequencies.put(element, seenSoFar + 1);
        }
    }
    return frequencies;
}
StringUtils类(也可以不用它):
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class StringUtils {
    /**
     * Matches the run of blank characters at the end of a string: everything
     * {@link String#trim()} removes (code points up to U+0020) plus these
     * special blanks:
     * U+FFFD (replacement character, listed by the original author as a
     * non-breaking-space variant), U+2003 (em space), U+2002 (en space),
     * U+2005 (four-per-em space), U+200C and U+200D (zero-width characters),
     * U+00A0 (no-break space) and U+3000 (ideographic space).
     * Compiled once because Pattern compilation is comparatively expensive.
     */
    private static final Pattern TRAILING_BLANKS =
            Pattern.compile("[\\x00-\\x20\ufffd\u2003\u2002\u2005\u200c\u200d\u00a0\u3000]+$");

    /** Matches a single character in the range U+4E00 to U+9FA5. */
    private static final Pattern CHINESE = Pattern.compile("[\u4e00-\u9fa5]");

    /**
     * Checks whether a string is blank: {@code null}, empty, or made up only
     * of whitespace characters as defined by {@link Character#isWhitespace(char)}.
     * <pre>
     * StringUtils.isBlank(null) = true
     * StringUtils.isBlank("") = true
     * StringUtils.isBlank(" ") = true
     * StringUtils.isBlank("bob") = false
     * StringUtils.isBlank(" bob ") = false
     * </pre>
     * @author falcon
     * @date 2019-05-12 12:03:59
     * @param outStr the string to check; may be {@code null}
     * @return {@code true} if the string is blank
     */
    public static boolean isBlank(String outStr){
        if (outStr == null) {
            return true;
        }
        int length = outStr.length();
        for (int i = 0; i < length; i++) {
            if (!Character.isWhitespace(outStr.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Negation of {@link #isBlank(String)}.
     *
     * @author falcon
     * @date 2019-05-12 12:04:04
     * @param outStr the string to check; may be {@code null}
     * @return {@code true} if the string contains at least one non-whitespace character
     */
    public static boolean isNotBlank(String outStr){
        return !isBlank(outStr);
    }

    /**
     * Trims a string: removes ordinary leading whitespace (as
     * {@link String#trim()} does) and, at the end of the string, any mix of
     * ordinary whitespace and the special blank characters listed on
     * {@code TRAILING_BLANKS}.
     * <p>
     * Special blanks at the start of the string are left untouched.
     *
     * @author falcon
     * @date 2019-05-12 12:06:50
     * @param outStr the string to trim; may be {@code null}
     * @return the trimmed string, or {@code ""} when {@code outStr} is {@code null}
     */
    public static String trim(String outStr){
        if (outStr == null) {
            return "";
        }
        // Strip the whole trailing run in one go so that special blanks followed
        // by ordinary whitespace (e.g. ideographic space + space) are removed too.
        return TRAILING_BLANKS.matcher(outStr).replaceAll("").trim();
    }

    /**
     * Checks whether a string contains at least one character in the range
     * U+4E00 to U+9FA5.
     *
     * @author falcon
     * @date 2019-05-12 12:31:56
     * @param outStr the string to inspect; may be {@code null}
     * @return {@code true} if such a character is present; {@code false}
     *         otherwise, including for {@code null}
     */
    public static boolean isContainChinese(String outStr){
        if (outStr == null) {
            return false;
        }
        return CHINESE.matcher(outStr).find();
    }
}