jianghuyihei committed on
Commit
264c5ec
1 Parent(s): 789383a
Files changed (1) hide show
  1. searcher/sementic_search.py +11 -8
searcher/sementic_search.py CHANGED
@@ -132,6 +132,15 @@ class SementicSearcher:
132
  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
133
 
134
  def read_arxiv_from_path(self, pdf_path):
 
 
 
 
 
 
 
 
 
135
  try:
136
  article_dict = scipdf.parse_pdf_to_dict(pdf_path)
137
  except Exception as e:
@@ -285,10 +294,7 @@ Abstract: {paper['abstract']}
285
  abstract = result['abstract']
286
  citationCount = result['citationCount']
287
  year = result['year']
288
- try:
289
- article = scipdf.parse_pdf_to_dict(content)
290
- except Exception as e:
291
- article = None
292
  if not article:
293
  continue
294
  final_results.append(Result(title,abstract,article,citationCount,year))
@@ -357,10 +363,7 @@ Abstract: {paper['abstract']}
357
  url = paper[2]
358
  content = await self.download_pdf_async(url)
359
  if content:
360
- try:
361
- article = scipdf.parse_pdf_to_dict(content)
362
- except Exception as e:
363
- article = None
364
  if not article:
365
  continue
366
  result = Result(paper[0],paper[1],article,paper[3],paper[4])
 
132
  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
133
 
134
  def read_arxiv_from_path(self, pdf_path):
135
+ def is_pdf(binary_data):
136
+ pdf_header = b'%PDF-'
137
+ return binary_data.startswith(pdf_header)
138
+ try:
139
+ flag = is_pdf(pdf_path)
140
+ if not flag:
141
+ return None
142
+ except Exception as e:
143
+ pass
144
  try:
145
  article_dict = scipdf.parse_pdf_to_dict(pdf_path)
146
  except Exception as e:
 
294
  abstract = result['abstract']
295
  citationCount = result['citationCount']
296
  year = result['year']
297
+ article = self.read_arxiv_from_path(content)
 
 
 
298
  if not article:
299
  continue
300
  final_results.append(Result(title,abstract,article,citationCount,year))
 
363
  url = paper[2]
364
  content = await self.download_pdf_async(url)
365
  if content:
366
+ article = self.read_arxiv_from_path(content)
 
 
 
367
  if not article:
368
  continue
369
  result = Result(paper[0],paper[1],article,paper[3],paper[4])