
采用优化后的程序:
import math
import time
len_post = {}
def read_data(filename):
global len_post
data = open(filename, 'r')
post = {} # post[pid] = [uid_1, uid_2, ..., uid_m]
user = {} # user[uid] = [pid_1, pid_2, ..., pid_m]
alllines = data.readlines()
for line in alllines:
pid, uid = line.split()
pid = int(pid)
uid = int(uid)
if not post.has_key(pid): # post 2 user table; only record the user set
post[pid] = {}
if post[pid].has_key(uid):
post[pid][uid] += 1
else:
post[pid][uid] = 1
if len_post.has_key(pid):
len_post[pid] += 1
else:
len_post[pid] = 1
if not user.has_key(uid): # user 2 post table; record post set and frequency
user[uid] = {}
if user[uid].has_key(pid):
user[uid][pid] += 1
else:
user[uid][pid] = 1
len_data = len(alllines)
data.close()
return post, user, len_data
def find_sim_post(post, user):
post_ids = post.keys()
id_map = {} # to find the order of a post quickly
global len_post
for i in range(len(post_ids)):
v = post_ids[i]
id_map[v] = i
for p in post:
res = []
pid = id_map[p]
#print pid, # print process
common_user = post[p] # all users who used post_p
sim_post = {} # posts which have at least one user in common_user; python2.4 required
for u in common_user:
for sp in user[u]: # posts of user_u
mco = min(user[u][p], user[u][sp]) # coocurrency of two posts used by two users
if sim_post.has_key(sp):
sim_post[sp] += mco # cooccurrence of two posts used by all users
else:
sim_post[sp] = mco
sim_post.pop(p)
for sp in sim_post:
co = sim_post[sp]/math.sqrt(len_post[p]*len_post[sp]) # because sqrt(len_post[p]) is identical
res.append((co, sp))
res.sort() # here we can use least-N algorithm to optimize if only N most similar items needed
res.reverse()
print '%d: %s' % (p, [k for co, k in res]) # if you want to only print top 10 items, use res[:10] instead of res
return res
if __name__ == "__main__":
post, user, num_lines= read_data('pw_posts.csv')
print "message:", num_lines, "post:", len(post), "user:", len(user) #214348, 20310, 3844
begin = time.time()
res1 = find_sim_post(post, user)
end = time.time()
print "finding similar posts is done", end-begin
经过翻译的 Lisp程序:
(defun split-by-colon (string)
(loop for i = 0 then (1+ j)
as j = (position #\; string :start i)
collect (subseq string i j)
while j))
(defun read-src ()
(let ((in (open "myprog/pw_posts.csv" :if-does-not-exist nil))
(data '()))
(when in
(loop for line = (read-line in nil)
while line do (push (split-by-colon line) data))
(close in))
data))
(defun get-dic (data)
(let ((post (make-hash-table :test 'equal))
(user (make-hash-table :test 'equal)))
(dolist (pair data)
(progn
(if (gethash (car pair) post)
(push (second pair) (gethash (car pair) post))
(setf (gethash (car pair) post) (cdr pair))))
(if (gethash (second pair) user)
(push (car pair) (gethash (second pair) user))
(setf (gethash (second pair) user) (list (car pair)))))
(list post user)))
(defvar *DATA* nil)
(defvar *post* nil)
(defvar *user* nil)
(defun load-data ()
(let ((d (get-dic (read-src))))
(format t "User count: ~A~%Post count: ~A~%" (hash-table-count (car d))
(hash-table-count (second d)))
(setq *DATA* d)
(setq *post* (car d))
(setq *user* (second d)))
t)
(defun cooccurrence (pref1 pref2)
(length (intersection
(remove-duplicates pref1)
(remove-duplicates pref2) :test 'equal)))
(defun sort-it (list)
(sort list #'(lambda (x y)
(> (second x)(second y)))))
(defun proc-res-list (list)
(let ((res (subseq (sort-it list) 1 11)))
(format t " ~A~%" res)
res))
(defun all-sim (data)
(let ((res (make-hash-table :test 'equal))
(i 0))
(loop for k being the hash-keys in data using (hash-value v)
do (progn
(setq i (+ i 1))
(format t "~A- ~A : " i k)
(setf (gethash k res) '())
(loop for k2 being the hash-keys in data using (hash-value v2)
do (push (list k2 (cooccurrence v v2)) (gethash k res)))
(setf (gethash k res) (proc-res-list (gethash k res)))))
res))
(defun write-res (data filename)
(let ((out (open filename :direction :output :if-exists :supersede)))
(when out
(loop for k being the hash-key in data using (hash-value v)
do (progn
(write-string k out)
(write-string ":" out)
(dolist (p v) (progn
(write-string (car p) out)
(write-string " " out)))
(write-line "" out)))
(close out))))
(defun calc ()
(progn
(load-data)
(write-res (all-sim *post*) "post_res.dat")
(write-res (all-sim *user*) "user_res.dat")))
python的程序执行时间为10分种,而Lisp的需要40分钟。.
肯定什么地方搞错了,应该还可以继续优化的。