文本挖掘--python
发布时间:2021-01-17 16:18:47  所属栏目:大数据  来源:网络整理 
            导读:# -*- coding: utf-8 -*- """ Created on Mon Oct 03 11:07:58 2016 @author: liqi """ keep = { 'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' , 'h' , 'i' , 'j' , 'k' , 'l' , 'm' , 'n' , 'o' , 'p' , 'q' , 'r' , 's' , 't' , 'u' , 'v' , 'w' , 'x' , 'y'
                
                
                
            | 
 # -*- coding: utf-8 -*-
""" Created on Mon Oct 03 11:07:58 2016 @author: liqi """
# Characters that survive normalization: the 26 lowercase ASCII letters plus
# space, hyphen and apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")
def normalize(s):
    """Return ``s`` lowercased, with every character not in ``keep`` dropped."""
    lowered = s.lower()
    kept_chars = [ch for ch in lowered if ch in keep]
    return ''.join(kept_chars)
def make_freq_dict(s):
    """Build a word -> occurrence-count dictionary for the text ``s``.

    The text is first passed through normalize() and then split on
    whitespace; each resulting token is counted as one word.
    """
    counts = {}
    for word in normalize(s).split():
        # get() supplies 0 the first time a word is seen.
        counts[word] = counts.get(word, 0) + 1
    return counts
def print_file_stats(fname, top_n=20):
    """Print size statistics and the most frequent words of a text file.

    Reads the whole file, then prints its character count, its line count
    (number of newline characters), its word count and a ranked list of the
    most frequent words. Words are obtained from make_freq_dict().

    Args:
        fname: Path of the text file to analyse.
        top_n: Number of most-frequent words to list. Defaults to 20, the
            number of entries this function has always printed; expected to
            be a non-negative integer.

    Returns:
        None. All results are written to standard output.
    """
    # Context manager guarantees the file handle is closed after reading.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    # Count newline characters -- not occurrences of the letter 'n'.
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())
    # (count, word) tuples sorted descending: highest count first, ties
    # broken by the word in reverse alphabetical order.
    lst = sorted(((count, word) for word, count in d.items()), reverse=True)
    print("The file '%s' has:" % fname)
    print(" %s characters" % num_chars)
    print(" %s lines" % num_lines)
    print(" %s words" % num_words)
    # The heading reports the number of entries actually requested.
    print("\nThe top %s most frequent words are:" % top_n)
    for i, (count, word) in enumerate(lst[:top_n], 1):
        print('%2s. %4s %s' % (i, count, word))
def main():
    """Script entry point: print the statistics for 'bill.txt'."""
    target = 'bill.txt'
    print_file_stats(target)
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

# Output (recorded in the original text after the code):
#
# The file 'bill.txt' has:
#  34426 characters
#  94 lines
#  6215 words
#
# The top 10 most frequant words are:
#  1.  320 the
#  2.  260 i
#  3.  202 and
#  4.  183 to
#  5.  148 of
#  6.  147 a
#  7.  131 was
#  8.  124 in
#  9.   81 my
# 10.   64 he
# 11.   61 for
# 12.   57 had
# 13.   56 that
# 14.   51 it
# 15.   50 with
# 16.   50 me
# 17.   48 his
# 18.   47 on
# 19.   35 when
# 20.   35 but
#
# (Editor: Handan Webmaster Network)
# [Disclaimer] The content of this site comes from the internet; the views
# expressed represent only the author's personal opinion and not the position
# of this site. If your rights have been inadvertently infringed, please
# contact the webmaster promptly to have the content removed.


