Spaces:

DAMO-NLP-SG
/

CoI_Agent

Running

jianghuyihei committed on Oct 12

Commit

264c5ec

•

1 Parent(s): 789383a

fix

Files changed (1) hide show

searcher/sementic_search.py CHANGED Viewed

@@ -132,6 +132,15 @@ class SementicSearcher:
         return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
     def read_arxiv_from_path(self, pdf_path):
         try:
             article_dict = scipdf.parse_pdf_to_dict(pdf_path)
         except Exception as e:
@@ -285,10 +294,7 @@ Abstract: {paper['abstract']}
                 abstract = result['abstract']
                 citationCount = result['citationCount']
                 year = result['year']
-                try:
-                    article = scipdf.parse_pdf_to_dict(content)
-                except Exception as e:
-                    article = None
                 if not article:
                     continue
                 final_results.append(Result(title,abstract,article,citationCount,year))
@@ -357,10 +363,7 @@ Abstract: {paper['abstract']}
             url = paper[2]
             content = await self.download_pdf_async(url)
             if content:
-                try:
-                    article = scipdf.parse_pdf_to_dict(content)
-                except Exception as e:
-                    article = None
                 if not article:
                     continue
                 result = Result(paper[0],paper[1],article,paper[3],paper[4])

         return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
     def read_arxiv_from_path(self, pdf_path):
+        def is_pdf(binary_data):
+            pdf_header = b'%PDF-'
+            return binary_data.startswith(pdf_header)
+        try:
+            flag = is_pdf(pdf_path)
+            if not flag:
+                return None
+        except Exception as e:
+            pass
         try:
             article_dict = scipdf.parse_pdf_to_dict(pdf_path)
         except Exception as e:
                 abstract = result['abstract']
                 citationCount = result['citationCount']
                 year = result['year']
+                article = self.read_arxiv_from_path(content)
                 if not article:
                     continue
                 final_results.append(Result(title,abstract,article,citationCount,year))
             url = paper[2]
             content = await self.download_pdf_async(url)
             if content:
+                article = self.read_arxiv_from_path(content)
                 if not article:
                     continue
                 result = Result(paper[0],paper[1],article,paper[3],paper[4])