Spaces:
Running
Running
jianghuyihei
committed on
Commit
•
264c5ec
1
Parent(s):
789383a
fix
Browse files- searcher/sementic_search.py +11 -8
searcher/sementic_search.py
CHANGED
@@ -132,6 +132,15 @@ class SementicSearcher:
|
|
132 |
return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
|
133 |
|
134 |
def read_arxiv_from_path(self, pdf_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
try:
|
136 |
article_dict = scipdf.parse_pdf_to_dict(pdf_path)
|
137 |
except Exception as e:
|
@@ -285,10 +294,7 @@ Abstract: {paper['abstract']}
|
|
285 |
abstract = result['abstract']
|
286 |
citationCount = result['citationCount']
|
287 |
year = result['year']
|
288 |
-
|
289 |
-
article = scipdf.parse_pdf_to_dict(content)
|
290 |
-
except Exception as e:
|
291 |
-
article = None
|
292 |
if not article:
|
293 |
continue
|
294 |
final_results.append(Result(title,abstract,article,citationCount,year))
|
@@ -357,10 +363,7 @@ Abstract: {paper['abstract']}
|
|
357 |
url = paper[2]
|
358 |
content = await self.download_pdf_async(url)
|
359 |
if content:
|
360 |
-
|
361 |
-
article = scipdf.parse_pdf_to_dict(content)
|
362 |
-
except Exception as e:
|
363 |
-
article = None
|
364 |
if not article:
|
365 |
continue
|
366 |
result = Result(paper[0],paper[1],article,paper[3],paper[4])
|
|
|
132 |
return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
|
133 |
|
134 |
def read_arxiv_from_path(self, pdf_path):
|
135 |
+
def is_pdf(binary_data):
|
136 |
+
pdf_header = b'%PDF-'
|
137 |
+
return binary_data.startswith(pdf_header)
|
138 |
+
try:
|
139 |
+
flag = is_pdf(pdf_path)
|
140 |
+
if not flag:
|
141 |
+
return None
|
142 |
+
except Exception as e:
|
143 |
+
pass
|
144 |
try:
|
145 |
article_dict = scipdf.parse_pdf_to_dict(pdf_path)
|
146 |
except Exception as e:
|
|
|
294 |
abstract = result['abstract']
|
295 |
citationCount = result['citationCount']
|
296 |
year = result['year']
|
297 |
+
article = self.read_arxiv_from_path(content)
|
|
|
|
|
|
|
298 |
if not article:
|
299 |
continue
|
300 |
final_results.append(Result(title,abstract,article,citationCount,year))
|
|
|
363 |
url = paper[2]
|
364 |
content = await self.download_pdf_async(url)
|
365 |
if content:
|
366 |
+
article = self.read_arxiv_from_path(content)
|
|
|
|
|
|
|
367 |
if not article:
|
368 |
continue
|
369 |
result = Result(paper[0],paper[1],article,paper[3],paper[4])
|