OSDN Git Service

implement file path id mode.
author: gn64_jp <gn64@rec10.org>
Wed, 17 Apr 2013 12:46:33 +0000 (21:46 +0900)
committer: gn64_jp <gn64@rec10.org>
Wed, 17 Apr 2013 12:46:33 +0000 (21:46 +0900)
index/pdf2xml.py

index 2a9369b..8810820 100644 (file)
@@ -10,12 +10,13 @@ import time
 import re
 #import unicode
 import hashlib
+#import base64
 import solr
 import yaml
 import sqlite3
 
-confstr="/etc/libre10.conf"
-sqlpath="/tmp/libre10.db"
+confstr="/etc/rec10pdf.conf"
+sqlpath="/tmp/rec10pdf.db"
 conf_dic=yaml.load(open(confstr).read())
 dbcon=sqlite3.connect(sqlpath,isolation_level=None)
 try:
@@ -66,7 +67,7 @@ def TEXT2solr(solrcon,titletxt,textpath,pagenum,pagemax,pdfpath,idnum):
     title_g=titletxt.split("_Part")[0]
     title_g_id=hashlib.sha224(title_g.encode("utf-8")).hexdigest()
     print titletxt+" : "+str(pagenum)+"/"+str(pagemax)
-    path_id=hashlib.sha224(os.path.abspath(pdfpath).encode("utf-8")).hexdigest()
+    path_id=idnum
     #print data1
     solrcon.add(id=idnum+"_"+str(pagenum),title=titletxt,title_group=title_g,
     title_group_id=title_g_id,page=int(pagenum),pagemax=int(pagemax),
@@ -79,11 +80,19 @@ argc=len(argvs)
 for st in argvs[1:]:
     #print os.path.splitext(st)
     if os.path.splitext(st)[1].upper() == ".PDF":
-        rd=hashlib.sha224(os.path.splitext(st)[0]).hexdigest()
-        rd2=hashlib.sha224(os.path.abspath(st)).hexdigest()
+        #rd=hashlib.sha224(os.path.splitext(st)[0]).hexdigest()
+        #rd2=hashlib.sha224(os.path.abspath(st)).hexdigest()
+        #rd = base64.b64encode(os.path.abspath(st))[]
+        rd = hashlib.md5()
+        rd.update(os.path.abspath(st))
+        rdl=rd.hexdigest()
         c = dbcon.cursor()
-        c.execute(u"select count(*) from pdffile where id=?",(rd2,))
-        if int(c.fetchone()[0])<1:
-            PDF2TEXT(st.decode("utf-8"),str(rd)+".txt",rd)
-    dbcon.close()
+        c.execute(u"select count(*) from pdffile where id=?",(rdl,))
+        filecount = c.fetchone()[0]
+        print st
+        print rdl
+        print filecount
+        if int(filecount)<1:
+            PDF2TEXT(st.decode("utf-8"),str(rdl)+".txt",rdl)
+dbcon.close()