Benjamin Bossan committed
Commit 71965fb
1 Parent(s): 31610d3

Add a source snippet to the results

This way it is easier to trace the source of the entry.

demo.py CHANGED
@@ -19,14 +19,17 @@ def check_status():
 
 
 def check_tags():
+    url = "http://localhost:8080/tag_counts/"
     try:
         tag_counts = {
             key.strip("#"): val
-            for key, val in httpx.get("http://localhost:8080/tag_counts/").json().items()
+            for key, val in httpx.get(url).json().items()
             if key.strip("#")
         }
         sorted_tags = sorted(list(tag_counts.items()), key=lambda tup: -tup[1])
-        result = "Most common tags: " + ", ".join(f"{t} ({c})" for t, c in sorted_tags[:5])
+        result = "Most common tags: " + ", ".join(
+            f"{t} ({c})" for t, c in sorted_tags[:5]
+        )
     except httpx.ConnectError:
         result = ""
     return result
@@ -45,6 +48,7 @@ def get_results(inputs: list[str]):
         texts.append(
             f"## {i}. author: {entry['author']}\n\n"
             f"Date: _{entry['date']}_\n\n"
+            f"Source: _{entry['source_snippet']}_\n\n"
             f"**Summary**: {entry['summary']}\n\n"
             f"tags: _{' '.join(entry['tags'])}_"
         )
@@ -64,9 +68,6 @@ Input currently supports:
 - a URL to a webpage
 - a URL to a youtube video (the video will be transcribed)
 - a URL to an image (the image description will be used)
-
-Processing can take a couple of minutes, the info box below gives an update on
-how many jobs are in the queue.
 """
 
 
@@ -78,8 +79,12 @@ def get_demo():
         btn_submit = gr.Button("Submit")
 
         # check job status
+        gr.Markdown(
+            "Processing can take a couple of minutes, the info box below gives an "
+            "update on how many jobs are in the queue."
+        )
         gr.HTML(value=check_status, label="Status", every=3)
-        gr.HTML(value=check_tags, label="Status", every=10)
+        gr.HTML(value=check_tags, label="Status", every=9)
 
         # check box of tags to filter on
         msg = "Enter hashtags to filter on (comma separated if multiple)"
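
For reference, a minimal sketch of the tag-formatting logic that check_tags now spreads over several lines, run against a made-up payload standing in for the JSON returned by the /tag_counts/ endpoint:

# Made-up payload standing in for the JSON returned by /tag_counts/
payload = {"#nlp": 4, "#rl": 2, "#diffusion": 7, "": 1}

tag_counts = {key.strip("#"): val for key, val in payload.items() if key.strip("#")}
sorted_tags = sorted(tag_counts.items(), key=lambda tup: -tup[1])
print("Most common tags: " + ", ".join(f"{t} ({c})" for t, c in sorted_tags[:5]))
# -> Most common tags: diffusion (7), nlp (4), rl (2)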
requests.org CHANGED
@@ -16,7 +16,7 @@ curl -X 'POST' \
   -H 'Content-Type: application/json' \
   -d '{
   "author": "ben",
-  "content": "In literature discussing why ChatGPT is able to capture so much of our imagination, I often come across two narratives: Scale: throwing more data and compute at it. UX: moving from a prompt interface to a more natural chat interface. A narrative that is often glossed over in the demo frenzy is the incredible technical creativity that went into making models like ChatGPT work. One such cool idea is RLHF (Reinforcement Learning from Human Feedback): incorporating reinforcement learning and human feedback into NLP. RL has been notoriously difficult to work with, and therefore, mostly confined to gaming and simulated environments like Atari or MuJoCo. Just five years ago, both RL and NLP were progressing pretty much orthogonally – different stacks, different techniques, and different experimentation setups. It’s impressive to see it work in a new domain at a massive scale. So, how exactly does RLHF work? Why does it work? This post will discuss the answers to those questions."
+  "content": "Diffusers is the go-to library for state-of-the-art pretrained diffusion models for generating images, audio, and even 3D structures of molecules. Whether you’re looking for a simple inference solution or want to train your own diffusion model, 🤗 Diffusers is a modular toolbox that supports both. Our library is designed with a focus on usability over performance, simple over easy, and customizability over abstractions."
 }'
 #+end_src
 
@@ -30,7 +30,7 @@ curl -X 'POST' \
   -H 'Content-Type: application/json' \
   -d '{
   "author": "ben",
-  "content": "https://en.wikipedia.org/wiki/Goulburn_Street"
+  "content": "https://en.wikipedia.org/wiki/Hugging_Face"
 }'
 #+end_src
 
@@ -44,12 +44,23 @@ curl -X 'POST' \
   -H 'Content-Type: application/json' \
   -d '{
   "author": "ben",
-  "content": "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e1/Cattle_tyrant_%28Machetornis_rixosa%29_on_Capybara.jpg/1920px-Cattle_tyrant_%28Machetornis_rixosa%29_on_Capybara.jpg"
+  "content": "https://images.openai.com/blob/8a2b0833-55f2-44d6-bf4f-85f9471078f5/Anastronautridingahorseinaphotorealisticstyle6.jpg"
 }'
 #+end_src
 
 #+RESULTS:
-: Submitted job dc3da7b1d5aa47c38dc6713952104f5f
+: Submitted job f37729bb36104ab4a23cefd0480e4862
+
+#+begin_src bash
+curl -X 'POST' \
+  'http://localhost:8080/submit/' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "author": "ben",
+  "content": "https://www.youtube.com/watch?v=H39Z_720T5s"
+}'
+#+end_src
 
 #+begin_src bash
 curl -X 'GET' \
src/gistillery/base.py CHANGED
@@ -13,6 +13,7 @@ class EntriesResult(BaseModel):
     id: str
     author: str
     summary: str
+    source_snippet: str
     tags: list[str]
     date: dt.datetime
 
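For orientation, a small hand-written sketch of the extended model; the field values below are invented and only illustrate the new source_snippet attribute next to the existing fields:

import datetime as dt

from gistillery.base import EntriesResult

entry = EntriesResult(
    id="abc123",
    author="ben",
    summary="An article about Hugging Face.",
    source_snippet="https://en.wikipedia.org/wiki/Hugging_Face",
    tags=["#nlp", "#opensource"],
    date=dt.datetime(2023, 7, 1),
)
print(entry.source_snippet)
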
src/gistillery/db.py CHANGED
@@ -16,6 +16,7 @@ CREATE TABLE entries
     id TEXT PRIMARY KEY,
     author TEXT NOT NULL,
     source TEXT NOT NULL,
+    source_snippet TEXT NOT NULL,
     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
 )
 """
src/gistillery/webservice.py CHANGED
@@ -1,5 +1,6 @@
 import datetime as dt
 import logging
+import reprlib
 import sqlite3
 import uuid
 
@@ -14,6 +15,8 @@ logger.setLevel(logging.DEBUG)
 
 
 app = FastAPI()
+aRepr = reprlib.Repr()
+aRepr.maxstring = 140
 
 
 # status
@@ -33,8 +36,12 @@ def submit_job(input: RequestInput) -> str:
     query = "INSERT INTO jobs (entry_id, status) VALUES (?, ?)"
     cursor.execute(query, (_id, "pending"))
     # create an entry
-    query = "INSERT INTO entries (id, author, source) VALUES (?, ?, ?)"
-    cursor.execute(query, (_id, input.author, input.content))
+    source_snippet = aRepr.repr(input.content).strip("'")
+    query = (
+        "INSERT INTO entries (id, author, source, source_snippet) VALUES "
+        "(?, ?, ?, ?)"
+    )
+    cursor.execute(query, (_id, input.author, input.content, source_snippet))
 
     return f"Submitted job {_id}"
 
@@ -81,7 +88,8 @@ def recent() -> list[EntriesResult]:
     # joined to a comma separated str
     cursor.execute("""
         SELECT
-            e.id, e.author, s.summary, GROUP_CONCAT(t.tag, ",") tags, e.created_at
+            e.id, e.author, e.created_at, e.source_snippet,
+            s.summary, GROUP_CONCAT(t.tag, ",") tags
         FROM entries e
         JOIN summaries s ON e.id = s.entry_id
         JOIN tags t ON e.id = t.entry_id
@@ -92,9 +100,14 @@ def recent() -> list[EntriesResult]:
     results = cursor.fetchall()
 
     entries = []
-    for _id, author, summary, tags, date in results:
+    for row in results:
         entry = EntriesResult(
-            id=_id, author=author, summary=summary, tags=tags.split(","), date=date
+            id=row.id,
+            author=row.author,
+            summary=row.summary,
+            source_snippet=row.source_snippet,
+            tags=row.tags.split(","),
+            date=row.created_at,
         )
         entries.append(entry)
     return entries
@@ -110,7 +123,8 @@ def recent_tag(tag: str) -> list[EntriesResult]:
     cursor.execute(
         """
        SELECT
-            e.id, e.author, s.summary, GROUP_CONCAT(t.tag, ",") tags, e.created_at
+            e.id, e.author, e.source_snippet, e.created_at,
+            s.summary, GROUP_CONCAT(t.tag, ",") tags
         FROM entries e
         JOIN summaries s ON e.id = s.entry_id
         JOIN tags t ON e.id = t.entry_id
@@ -126,9 +140,14 @@ def recent_tag(tag: str) -> list[EntriesResult]:
     results = cursor.fetchall()
 
     entries = []
-    for _id, author, summary, tag, date in results:
+    for row in results:
         entry = EntriesResult(
-            id=_id, author=author, summary=summary, tags=tag.split(","), date=date
+            id=row.id,
+            author=row.author,
+            summary=row.summary,
+            source_snippet=row.source_snippet,
+            tags=row.tags.split(","),
+            date=row.created_at,
         )
         entries.append(entry)
     return entries
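
A minimal sketch of the reprlib truncation that submit_job now relies on, using only the standard library and made-up inputs (the same strings the new test below checks against):

import reprlib

aRepr = reprlib.Repr()
aRepr.maxstring = 140  # same budget as in webservice.py

print(aRepr.repr("this is short").strip("'"))
# -> this is short (short inputs pass through unchanged)

print(aRepr.repr("this is long " * 100).strip("'"))
# -> roughly 140 characters, with "..." replacing the middle of the string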
src/gistillery/worker.py CHANGED
@@ -1,5 +1,5 @@
-import time
 import sys
+import time
 from dataclasses import dataclass
 
 from gistillery.base import JobInput
tests/test_app.py CHANGED
@@ -213,6 +213,24 @@ class TestWebservice:
         assert resp1["summary"] == "this would"
         assert resp1["tags"] == sorted(["#this", "#would", "#be"])
 
+    def test_recent_source_snippet_shortened(self, client, registry):
+        # submit 2 entries
+        client.post("/submit", json={"author": "alice", "content": "this is short"})
+        client.post(
+            "/submit",
+            json={"author": "bob", "content": "this is long " * 100},
+        )
+        self.process_jobs(registry)
+        resp = client.get("/recent").json()
+        resp = sorted(resp, key=lambda x: x["author"])
+
+        assert resp[0]["source_snippet"] == "this is short"
+        expected_shortened = (
+            "this is long this is long this is long this is long this is long th"
+            "...ng this is long this is long this is long this is long this is long "
+        )
+        assert resp[1]["source_snippet"] == expected_shortened
+
     def test_recent_tag_with_entries(self, client, registry):
         # submit 2 entries
         client.post(