Your Ad Here
首页 | 编程语言 | 网站建设 | 游戏天堂 | 冲浪宝典 | 网络安全 | 操作系统 | 软件时空 | 硬件指南 | 病毒相关 | IT 认证
软讯网络 > 软件时空 > 软件相关 > 相关度计算2
【标  题】:相关度计算2
【关键字】:
【来  源】:http://www.cublog.cn/u/4614/showart.php?id=107446

相关度计算2

Your Ad Here 采用优化后的程序:

import math
import time

len_post = {}

def read_data(filename):
global len_post
data = open(filename, 'r')
post = {} # post[pid] = [uid_1, uid_2, ..., uid_m]
user = {} # user[uid] = [pid_1, pid_2, ..., pid_m]
alllines = data.readlines()
for line in alllines:
pid, uid = line.split()
pid = int(pid)
uid = int(uid)

if not post.has_key(pid): # post 2 user table; only record the user set
post[pid] = {}
if post[pid].has_key(uid):
post[pid][uid] += 1
else:
post[pid][uid] = 1

if len_post.has_key(pid):
len_post[pid] += 1
else:
len_post[pid] = 1

if not user.has_key(uid): # user 2 post table; record post set and frequency
user[uid] = {}
if user[uid].has_key(pid):
user[uid][pid] += 1
else:
user[uid][pid] = 1
len_data = len(alllines)
data.close()
return post, user, len_data

def find_sim_post(post, user):
post_ids = post.keys()
id_map = {} # to find the order of a post quickly
global len_post
for i in range(len(post_ids)):
v = post_ids[i]
id_map[v] = i
for p in post:
res = []
pid = id_map[p]
#print pid, # print process
common_user = post[p] # all users who used post_p
sim_post = {} # posts which have at least one user in common_user; python2.4 required
for u in common_user:
for sp in user[u]: # posts of user_u
mco = min(user[u][p], user[u][sp]) # coocurrency of two posts used by two users
if sim_post.has_key(sp):
sim_post[sp] += mco # cooccurrence of two posts used by all users
else:
sim_post[sp] = mco
sim_post.pop(p)
for sp in sim_post:
co = sim_post[sp]/math.sqrt(len_post[p]*len_post[sp]) # because sqrt(len_post[p]) is identical
res.append((co, sp))
res.sort() # here we can use least-N algorithm to optimize if only N most similar items needed
res.reverse()
print '%d: %s' % (p, [k for co, k in res]) # if you want to only print top 10 items, use res[:10] instead of res
return res


if __name__ == "__main__":
post, user, num_lines= read_data('pw_posts.csv')
print "message:", num_lines, "post:", len(post), "user:", len(user) #214348, 20310, 3844

begin = time.time()
res1 = find_sim_post(post, user)
end = time.time()
print "finding similar posts is done", end-begin

经过翻译的 Lisp程序:

(defun split-by-colon (string)
(loop for i = 0 then (1+ j)
as j = (position #\; string :start i)
collect (subseq string i j)
while j))

(defun read-src ()
(let ((in (open "myprog/pw_posts.csv" :if-does-not-exist nil))
(data '()))
(when in
(loop for line = (read-line in nil)
while line do (push (split-by-colon line) data))
(close in))
data))

(defun get-dic (data)
(let ((post (make-hash-table :test 'equal))
(user (make-hash-table :test 'equal)))
(dolist (pair data)
(progn
(if (gethash (car pair) post)
(push (second pair) (gethash (car pair) post))
(setf (gethash (car pair) post) (cdr pair))))
(if (gethash (second pair) user)
(push (car pair) (gethash (second pair) user))
(setf (gethash (second pair) user) (list (car pair)))))
(list post user)))


(defvar *DATA* nil)
(defvar *post* nil)
(defvar *user* nil)

(defun load-data ()
(let ((d (get-dic (read-src))))
(format t "User count: ~A~%Post count: ~A~%" (hash-table-count (car d))
(hash-table-count (second d)))
(setq *DATA* d)
(setq *post* (car d))
(setq *user* (second d)))
t)


(defun cooccurrence (pref1 pref2)
(length (intersection
(remove-duplicates pref1)
(remove-duplicates pref2) :test 'equal)))

(defun sort-it (list)
(sort list #'(lambda (x y)
(> (second x)(second y)))))

(defun proc-res-list (list)
(let ((res (subseq (sort-it list) 1 11)))
(format t " ~A~%" res)
res))


(defun all-sim (data)
(let ((res (make-hash-table :test 'equal))
(i 0))
(loop for k being the hash-keys in data using (hash-value v)
do (progn
(setq i (+ i 1))
(format t "~A- ~A : " i k)
(setf (gethash k res) '())
(loop for k2 being the hash-keys in data using (hash-value v2)
do (push (list k2 (cooccurrence v v2)) (gethash k res)))
(setf (gethash k res) (proc-res-list (gethash k res)))))
res))


(defun write-res (data filename)
(let ((out (open filename :direction :output :if-exists :supersede)))
(when out
(loop for k being the hash-key in data using (hash-value v)
do (progn
(write-string k out)
(write-string ":" out)
(dolist (p v) (progn
(write-string (car p) out)
(write-string " " out)))
(write-line "" out)))
(close out))))

(defun calc ()
(progn
(load-data)
(write-res (all-sim *post*) "post_res.dat")
(write-res (all-sim *user*) "user_res.dat")))


python的程序执行时间为10分种,而Lisp的需要40分钟。.
肯定什么地方搞错了,应该还可以继续优化的。

优化程序,提高效率:【上一篇】
Ajax——Simple XmlHttp Example:【下一篇】
【相关文章】
没有相关文章
【随机文章】
  • C#怎么使得窗体在最下面和最上面切不会因win+d而最小化
  • ORACLE9I FOR AIX 5L 的移动
  • Visual Studio.NET FAQ(中文版)
  • 单元测试究竟是测试什么?
  • Windows Live? OneCare?中文版现身
  • MSPlus ToolBar&Menu WebControl FreeVersion 1.1.0830 发布拉
  • sprintf你知道多少
  • 天堂2 北方V集体杀龙记
  • 用native2ascii工具解决properties文件乱码问题
  • redhat9下occi访问oracle9i
  • 【相关评论】
    没有相关评论
    【发表评论】
    姓名:
    邮件:
    随机码*
    评论*
          
    |  首 页  |  版权声明  |  联系我们   |  网站地图  |
    CopyRight © 2004-2007 bbb软讯网络 All Rigths Reserved.