silvanocerza committed
Commit 8d18428
Parent: 6b3c370

Rework download and indexing to save metadata

Files changed (1)
  1. main.py +26 -6
main.py CHANGED
@@ -39,15 +39,30 @@ DOCUMENTATIONS = [
 
 @st.cache_data(show_spinner=False)
 def fetch(documentations: List[Tuple[str, str, str]]):
-    paths = []
+    files = []
     for name, url, pattern in documentations:
         st.write(f"Fetching {name} repository")
         repo = Path(__file__).parent / "downloaded_docs" / name
         if not repo.exists():
             subprocess.run(["git", "clone", "--depth", "1", url, str(repo)], check=True)
-        paths.extend(repo.glob(pattern))
-
-    return paths
+        res = subprocess.run(
+            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+            check=True,
+            capture_output=True,
+            encoding="utf-8",
+        )
+        branch = res.stdout.strip()
+        for p in repo.glob(pattern):
+            data = {
+                "path": p,
+                "metadata": {
+                    "url_source": f"{url}/tree/{branch}/{p.relative_to(repo)}",
+                    "suffix": p.suffix,
+                },
+            }
+            files.append(data)
+
+    return files
 
 
 @st.cache_resource(show_spinner=False)
@@ -76,8 +91,13 @@ def index_files(files):
     indexing_pipeline.connect("cleaner", "splitter")
     indexing_pipeline.connect("splitter", "writer")
 
-    # And now we clone and save the documentation in our MemoryDocumentStore
-    indexing_pipeline.run({"converter": {"paths": files}})
+    # And now we save the documentation in our MemoryDocumentStore
+    paths = []
+    metadata = []
+    for f in files:
+        paths.append(f["path"])
+        metadata.append(f["metadata"])
+    indexing_pipeline.run({"converter": {"paths": paths, "metadata": metadata}})
 
 
 def search(question: str) -> GeneratedAnswer:
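Note on the reworked fetch: the `git rev-parse --abbrev-ref HEAD` call is run without a `cwd` argument, so it reports the branch of the process working directory rather than the repository that was just cloned. A minimal sketch of the same call pinned to the cloned repo, assuming that is the intended branch; the repo path here is hypothetical:

    import subprocess
    from pathlib import Path

    repo = Path("downloaded_docs") / "some_repo"  # hypothetical path for illustration

    # cwd=repo makes rev-parse report the shallow clone's checked-out branch
    res = subprocess.run(
        ["git", "rev-parse", "--abbrev-ref", "HEAD"],
        check=True,
        capture_output=True,
        encoding="utf-8",
        cwd=repo,
    )
    branch = res.stdout.strip()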
 
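With this change, fetch returns a list of dicts pairing each file path with its metadata, and index_files unzips them into two parallel lists so that metadata[i] is attached to the documents converted from paths[i]. A sketch of the resulting call flow, using the functions from this file with a placeholder DOCUMENTATIONS entry (name, clone URL, glob pattern):

    DOCUMENTATIONS = [
        # placeholder entry for illustration only
        ("example-docs", "https://github.com/example/example-docs", "**/*.md"),
    ]

    files = fetch(DOCUMENTATIONS)  # [{"path": Path(...), "metadata": {"url_source": ..., "suffix": ...}}, ...]
    index_files(files)             # splits into parallel paths/metadata lists for the converter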