优雅地统计英文文章中单词出现的次数

最新推荐文章于 2024-09-15 10:43:39 发布

爱码僧

最新推荐文章于 2024-09-15 10:43:39 发布

阅读量963

点赞数

CC 4.0 BY-SA版权

分类专栏： Java

本文链接：https://blog.csdn.net/qq_33666602/article/details/90217478

Java 专栏收录该内容

28 篇文章

订阅专栏

本文介绍了一种从文件中读取文本，并对其进行标点分割、统计单词频率的方法。使用Java实现，包括从文件读取内容、按标点符号分割字符串、统计各单词出现频率的功能。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

思路：

1、使用流将文件读到内存

2、转化为字符数组，通过ASCII码识别标点符号并以其为分隔符切分字符串，将切分出的单词添加到List集合

3、使用HashMap对List集合进行元素出现频率的统计

    /**
     * Demo entry point: reads a text file, splits it into words and prints
     * the number of distinct words followed by one {@code word:count} line per word.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        //String sourceStr = "I am a A programer,programer !";
        String sourceStr = Utils.getFileText("C:\\Users\\Sony\\Desktop\\英文文章.txt");
        List<String> resList = Utils.getElements(sourceStr);
        Map<String,Integer> resMap  = Utils.countFrequence(resList);

        // countFrequence may hand back null when there was nothing to count
        // (missing/empty file); report zero words instead of crashing with a
        // NullPointerException.
        if (resMap == null) {
            System.out.println("resMap.size():0");
            return;
        }

        System.out.println("resMap.size():"+resMap.size());
        // Typed entry instead of the raw Map.Entry keeps the loop free of unchecked warnings.
        for (Map.Entry<String,Integer> entry : resMap.entrySet()) {
            System.out.println(entry.getKey()+":" + entry.getValue());
        }
    }

/**
     * Reads the text file at {@code path} line by line and returns its content
     * as one string.
     *
     * <p>Line terminators are not preserved: consecutive lines are joined with a
     * single space so that the last word of one line and the first word of the
     * next line stay separate tokens. The file is decoded with the platform
     * default charset.
     *
     * <p>If the file cannot be opened, the stack trace is printed and an empty
     * string is returned.
     *
     * @author falcon
     * @date   2019-05-11 22:59:30
     * @param  path path of the file to read; a blank path yields {@code ""}
     * @return the file content with lines joined by single spaces, or {@code ""}
     *         when {@code path} is blank or the file could not be opened
     */
    public static String getFileText (@NotNull String path){
        if (StringUtils.isBlank(path)) return "";
        // StringBuilder avoids the quadratic cost of repeated String concatenation.
        StringBuilder text = new StringBuilder();
        // Both the stream and the scanner are closed automatically, including
        // when reading fails part-way through.
        try (InputStream is = new FileInputStream(path);
             Scanner scan = new Scanner(is)) {

            // Line-by-line reading keeps memory use per step small, which suits
            // large files (for small files, reading all bytes in one go and
            // building a String from them would also work).
            while (scan.hasNextLine()) {
                if (text.length() > 0) {
                    // nextLine() strips the line terminator; put a separator back
                    // so words on adjacent lines are not fused together.
                    text.append(' ');
                }
                text.append(scan.nextLine());
            }

        } catch (IOException e) {
            e.printStackTrace();
        }
        return text.toString();
    }

/**
     * Splits a string into its non-blank elements (words) and returns them in
     * order of appearance.
     *
     * <p>A character is treated as a separator when it is whitespace according
     * to {@link Character#isWhitespace(char)} (this covers spaces, tabs and both
     * CR and LF), or when its code lies in the ASCII ranges 32-47
     * (space and {@code ! " # $ % & ' ( ) * + , - . /}) or 58-64
     * ({@code : ; < = > ? @}). Other characters, including brackets and
     * non-ASCII punctuation, are kept as part of the element.
     *
     * <p>Every element is passed through {@code StringUtils.trim} and dropped
     * if it is blank afterwards.
     *
     * @author falcon
     * @date   2019-05-12 11:56:02
     * @param  outStr the text to split; may be {@code null}
     * @return the list of elements, or {@code null} when {@code outStr} is
     *         {@code null} or blank
     */
    public static List<String> getElements(String outStr){
        if (StringUtils.isBlank(outStr)) return null;
        List<String> rs = new ArrayList<String>();
        // Index of the first character of the element currently being scanned.
        // All indices refer to outStr itself, so substring offsets always match
        // the characters that were inspected.
        int elementStart = 0;
        for (int i = 0; i < outStr.length(); i++) {
            if (isElementDelimiter(outStr.charAt(i))) {
                addIfNotBlank(rs, outStr.substring(elementStart, i));
                elementStart = i + 1;
            }
        }
        // Whatever follows the last separator is the final element.
        addIfNotBlank(rs, outStr.substring(elementStart));
        return rs;
    }

    /** Tells whether {@code c} separates two elements (whitespace or ASCII 32-47 / 58-64). */
    private static boolean isElementDelimiter(char c) {
        return Character.isWhitespace(c)
                || (c >= 32 && c <= 47)
                || (c >= 58 && c <= 64);
    }

    /** Trims {@code raw} and appends it to {@code elements} unless nothing but blanks remains. */
    private static void addIfNotBlank(List<String> elements, String raw) {
        String niceStr = StringUtils.trim(raw);
        if (StringUtils.isNotBlank(niceStr)) {
            elements.add(niceStr);
        }
    }

    /**
     * Counts how many times each element occurs in the given list.
     *
     * @author falcon
     * @date   2019-05-10 13:14:36
     * @param  sourceList the elements to count; may be {@code null}
     * @return a map from each distinct element to its number of occurrences,
     *         or {@code null} when {@code sourceList} is {@code null} or empty
     */
    public static Map<String,Integer> countFrequence(List<String> sourceList) {
        if (sourceList == null || sourceList.isEmpty()) {
            return null;
        }
        Map<String,Integer> frequencies = new HashMap<>();
        for (String element : sourceList) {
            Integer seen = frequencies.get(element);
            if (seen == null) {
                // First occurrence of this element.
                frequencies.put(element, 1);
            } else {
                frequencies.put(element, seen + 1);
            }
        }
        return frequencies;
    }

StringUtils类（也可以不用它）：

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class StringUtils {

    /**
     * Matches a trailing run of the following characters: U+FFFD, U+2003,
     * U+2002, U+2005, U+200C, U+200D, U+00A0 and U+3000.
     * Compiled once because {@link Pattern} instances are immutable and reusable.
     */
    private static final Pattern TRAILING_SPECIAL_SPACES =
            Pattern.compile("[\ufffd\u2003\u2002\u2005\u200c\u200d\u00a0\u3000]+$");

    /** Matches a single character in the range U+4E00 to U+9FA5. */
    private static final Pattern CHINESE_CHARACTER = Pattern.compile("[\u4e00-\u9fa5]");

    /**
     * Checks whether a string is blank: {@code null}, the empty string
     * {@code ""}, or made up of whitespace characters only.
     *  <pre>
     * StringUtils.isBlank(null)      = true
     * StringUtils.isBlank("")        = true
     * StringUtils.isBlank(" ")       = true
     * StringUtils.isBlank("bob")     = false
     * StringUtils.isBlank("  bob  ") = false
     *  </pre>
     * @author falcon
     * @date   2019-05-12 12:03:59
     * @param  outStr the string to check; may be {@code null}
     * @return {@code true} if the string is {@code null}, empty or whitespace only
     */
    public static boolean isBlank(String outStr){
        if (outStr == null) {
            return true;
        }
        for (int i = 0; i < outStr.length(); i++) {
            if (!Character.isWhitespace(outStr.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Negation of {@link #isBlank(String)}.
     *
     * @author falcon
     * @date   2019-05-12 12:04:04
     * @param  outStr the string to check; may be {@code null}
     * @return {@code true} if the string contains at least one non-whitespace character
     */
    public static boolean isNotBlank(String outStr){
        return !isBlank(outStr);
    }

    /**
     * Removes a trailing run of special space-like characters (U+FFFD, U+2003,
     * U+2002, U+2005, U+200C, U+200D, U+00A0, U+3000) and then applies
     * {@link String#trim()} to strip leading and trailing ASCII whitespace and
     * control characters.
     *
     * @author falcon
     * @date   2019-05-12 12:06:50
     * @param  outStr the string to trim; may be {@code null}
     * @return the trimmed string, or {@code ""} when {@code outStr} is {@code null}
     */
    public static String trim(String outStr){
        if (outStr == null) {
            return "";
        }
        return TRAILING_SPECIAL_SPACES.matcher(outStr).replaceAll("").trim();
    }

    /**
     * Checks whether a string contains at least one character in the range
     * U+4E00 to U+9FA5.
     *
     * @author falcon
     * @date   2019-05-12 12:31:56
     * @param  outStr the string to inspect; may be {@code null}
     * @return {@code true} if such a character is present; {@code false}
     *         otherwise, including for {@code null} input
     */
    public static boolean isContainChinese(String outStr){
        // A null string contains nothing; answer false instead of letting
        // Pattern.matcher(null) throw a NullPointerException.
        if (outStr == null) {
            return false;
        }
        Matcher m = CHINESE_CHARACTER.matcher(outStr);
        return m.find();
    }
}