Spaces:
Running
Running
Commit
•
26240b0
0
Parent(s):
yo
Browse files- .gitattributes +35 -0
- Dockerfile +20 -0
- README.md +10 -0
- index.html +36 -0
- package.json +21 -0
- script.js +42 -0
- server.js +32 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use a maintained Node.js LTS image (node:14 reached end-of-life in April 2023
# and no longer receives security fixes)
FROM node:20

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy package.json (and package-lock.json, if present) first so the dependency
# layer stays cached until the manifests change
COPY package*.json ./

# Install runtime dependencies only; nodemon (a devDependency) is not needed in the image
RUN npm install --omit=dev

# Copy the rest of the application code to the working directory
COPY . .

# Hugging Face Docker Spaces route traffic to port 7860 unless `app_port` is set
# in the README front matter; server.js reads PORT, so bind it to that port
ENV PORT=7860
EXPOSE 7860

# Define the command to run the application
CMD ["node", "server.js"]
|
README.md
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Dom To Semantic Markdown
|
3 |
+
emoji: π
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: blue
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
---
|
9 |
+
|
10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
index.html
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>DOM to Semantic Markdown Converter</title>
|
7 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
8 |
+
</head>
|
9 |
+
<body>
|
10 |
+
<!-- Page shell: a fixed 50px toolbar row above an output row that takes the remaining height. Tailwind arbitrary values separate grid tracks with "_" (a comma produces an invalid grid-template-rows value). -->
<div class="w-dvh grid h-dvh grid-rows-[50px_1fr] overflow-hidden">
|
11 |
+
<div class="flex p-2 items-center gap-5 whitespace-nowrap">
|
12 |
+
<!-- form="converter-form" associates this field with the converter form, so the `required` / type="url" validation and Enter-to-submit apply to it -->
<input type="url" id="url-input" placeholder="Enter URL" class="bg-gray-100 h-8 w-96 rounded-lg px-2" form="converter-form" required>
|
13 |
+
<label>
|
14 |
+
<input type="checkbox" id="extract-main-content"> Extract main content
|
15 |
+
</label>
|
16 |
+
<label>
|
17 |
+
<input type="checkbox" id="refify-urls"> Refify URLs
|
18 |
+
</label>
|
19 |
+
<label>
|
20 |
+
<input type="checkbox" id="enable-table-column-tracking"> Enable table column tracking
|
21 |
+
</label>
|
22 |
+
<button type="submit" class="bg-black px-4 text-white h-8" form="converter-form">Convert to semantic markdown</button>
|
23 |
+
</div>
|
24 |
+
<div class="bg-gray-100 p-4 overflow-auto text-sm">
|
25 |
+
<pre id="markdown-output" class="whitespace-pre-wrap"></pre>
|
26 |
+
</div>
|
27 |
+
</div>
|
28 |
+
|
29 |
+
<form id="converter-form" class="hidden">
|
30 |
+
<input type="text" id="website-domain" placeholder="Website domain">
|
31 |
+
</form>
|
32 |
+
|
33 |
+
<script src="node_modules/dom-to-semantic-markdown/dist/browser/bundle.js"></script>
|
34 |
+
<script type="module" src="script.js"></script>
|
35 |
+
</body>
|
36 |
+
</html>
|
package.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "dom-to-semantic-markdown",
|
3 |
+
"version": "1.0.0",
|
4 |
+
"description": "--- title: Dom To Semantic Markdown emoji: π colorFrom: gray colorTo: blue sdk: docker pinned: false ---",
|
5 |
+
"main": "server.js",
|
6 |
+
"scripts": {
|
7 |
+
"start": "node server.js",
|
8 |
+
"dev": "nodemon server.js",
|
9 |
+
"test": "echo \"Error: no test specified\" && exit 1"
|
10 |
+
},
|
11 |
+
"author": "",
|
12 |
+
"license": "ISC",
|
13 |
+
"dependencies": {
|
14 |
+
"dom-to-semantic-markdown": "^1.0.11",
|
15 |
+
"express": "^4.19.2",
|
16 |
+
"axios": "^0.21.1"
|
17 |
+
},
|
18 |
+
"devDependencies": {
|
19 |
+
"nodemon": "^2.0.22"
|
20 |
+
}
|
21 |
+
}
|
script.js
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Converter entry point taken from the global `htmlToSMD`; presumably defined by the
// dom-to-semantic-markdown browser bundle that index.html loads before this script.
const { convertHtmlToMarkdown } = htmlToSMD;

/**
 * Submit handler for the converter form.
 *
 * Reads the URL and conversion options from the page, asks the server's
 * /fetch-html endpoint for the page markup, converts it to Markdown and writes
 * the result (or an "Error: ..." message) into the #markdown-output element.
 *
 * @param {Event} event - the form's submit event; its default action is cancelled.
 */
const handleConvert = async (event) => {
  event.preventDefault();

  const byId = (id) => document.getElementById(id);

  const pageUrl = byId("url-input").value;
  const outputEl = byId("markdown-output");
  const options = {
    extractMainContent: byId("extract-main-content").checked,
    refifyUrls: byId("refify-urls").checked,
    enableTableColumnTracking: byId("enable-table-column-tracking").checked,
    // An empty domain field is passed on as undefined rather than "".
    websiteDomain: byId("website-domain").value || undefined,
  };

  try {
    // The server fetches the remote page on the browser's behalf.
    const endpoint = `/fetch-html?url=${encodeURIComponent(pageUrl)}`;
    const response = await fetch(endpoint);
    if (!response.ok) {
      throw new Error("Failed to fetch HTML content");
    }
    const html = await response.text();

    const markdown = await convertHtmlToMarkdown(html, options);
    outputEl.textContent = markdown;
  } catch (error) {
    // Any fetch or conversion failure is shown in the output pane.
    outputEl.textContent = "Error: " + error.message;
  }
};

document
  .getElementById("converter-form")
  .addEventListener("submit", handleConvert);
|
server.js
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
const express = require("express");
|
2 |
+
const path = require("path");
|
3 |
+
const axios = require("axios");
|
4 |
+
const app = express();
|
5 |
+
const port = process.env.PORT || 3000;
|
6 |
+
|
7 |
+
// Serve static files from the current directory.
// NOTE(review): __dirname is the directory containing this file, so server.js,
// package.json and everything under node_modules are also reachable over HTTP.
// index.html depends on this to load script.js and
// node_modules/dom-to-semantic-markdown/dist/browser/bundle.js.
|
8 |
+
app.use(express.static(__dirname));
|
9 |
+
|
10 |
+
// Route to fetch HTML content from a given URL
|
11 |
+
/**
 * GET /fetch-html?url=<absolute http(s) URL>
 *
 * Downloads the document at `url` on behalf of the browser client and returns
 * its raw markup.
 *
 * Responses:
 *   200 - body is the fetched document, served as text/plain
 *   400 - `url` is missing, is not a single string, cannot be parsed, or is
 *         not an http(s) URL
 *   500 - the upstream request failed, timed out or exceeded the size cap
 */
app.get("/fetch-html", async (req, res) => {
  const { url } = req.query;
  if (!url) {
    return res.status(400).json({ error: "URL parameter is required" });
  }

  // Repeated or nested query keys arrive as arrays/objects; only a plain string
  // that parses as an absolute http(s) URL is accepted.
  let target = null;
  if (typeof url === "string") {
    try {
      target = new URL(url);
    } catch (parseError) {
      target = null;
    }
  }
  if (!target || !["http:", "https:"].includes(target.protocol)) {
    return res
      .status(400)
      .json({ error: "URL must be an absolute http(s) URL" });
  }

  // NOTE(security): the caller still chooses the host, so this endpoint can be
  // pointed at internal addresses (SSRF). Restricting destinations needs a
  // resolver-level allow/deny list, which is outside this handler.

  try {
    const response = await axios.get(target.href, {
      timeout: 15000, // do not let a stalled upstream hold the request open
      maxContentLength: 10 * 1024 * 1024, // cap the downloaded body at 10 MiB
      responseType: "text",
      // Keep the body as the raw string; the default transform would JSON-parse it.
      transformResponse: [(data) => data],
    });
    // text/plain stops a browser from rendering third-party markup on this origin;
    // the client reads the body with response.text() either way.
    res.type("text/plain").send(response.data);
  } catch (error) {
    console.error(`Failed to fetch ${target.href}: ${error.message}`);
    res.status(500).json({ error: "Failed to fetch HTML content" });
  }
});
|
24 |
+
|
25 |
+
// Fallback: every GET request not handled earlier is answered with index.html.
const indexPage = path.resolve(__dirname, "index.html");
app.get("*", (_req, res) => {
  res.sendFile(indexPage);
});

// Start the HTTP server and log the port once it is listening.
const announceStartup = () => {
  console.log(`Server is running on port ${port}`);
};
app.listen(port, announceStartup);
|