Initial revision
Nicky Nicolson committed • commit e6d4f1a
Parent(s): 7ba1d6e
Files changed:
- .gitignore +1 -1
- Dockerfile +9 -17
- README.md +2 -2
- extractcollectorname.py +0 -36
- getDownloadMetadata.py +0 -28
- metadata.json +5 -17
- requirements.txt +2 -8
- tab2csv.py +0 -112
.gitignore
CHANGED
@@ -1,2 +1,2 @@
 env
-data
+data
Dockerfile
CHANGED
@@ -2,32 +2,24 @@ FROM python:3.11
 
 # Download ID is set as a space variable
 # By default it is a download of all Solanum preserved specimen records (c600K)
-ARG
+ARG IHDELTA_REPO_URL=$IHDELTA_REPO_URL
+ARG START_AFTER_COMMIT=$START_AFTER_COMMIT
 WORKDIR /code
 
 COPY ./requirements.txt /code/requirements.txt
 
 RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 
-#
+# Clone ihdelta repo
 RUN mkdir /data
-
-RUN
-RUN
-RUN ls -lh /data
-COPY ./tab2csv.py /code/tab2csv.py
+RUN mkdir /data/ihdelta
+RUN git clone ${IHDELTA_REPO_URL} /data/ihdelta
+RUN git-history file --start-after ${START_AFTER_COMMIT} --csv --dialect unix --repo /data/ihdelta --id irn /code/ihdelta.db /data/ihdelta/ih-institutions.csv
 
-
-RUN
-RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
-RUN ls -l /code
-RUN sqlite-utils tables /code/gbifocc.db --counts
-RUN sqlite-utils enable-fts /code/gbifocc.db gbifocc collectorNameAndNumber
-RUN chmod 755 /code/gbifocc.db
+RUN sqlite-utils tables /code/ihdelta.db --counts
+RUN chmod 755 /code/ihdelta.db
 
 # Create datasette metadata file
-COPY ./getDownloadMetadata.py /code/getDownloadMetadata.py
 COPY ./metadata.json /code/metadata.json
-RUN python getDownloadMetadata.py /code/metadata.json /code/metadata.json --download_id=$GBIF_DOWNLOAD_ID
 
-CMD ["datasette", "/code/
+CMD ["datasette", "/code/ihdelta.db", "-m", "/code/metadata.json", "--host", "0.0.0.0", "--port", "7860", "--setting", "sql_time_limit_ms", "3500"]
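For reference, the revised Dockerfile expects the ihdelta repository URL and a starting commit to be supplied as build arguments (on the Space these come from space variables). A minimal local sketch, assuming a hypothetical repository URL and placeholder commit hash (not values taken from this commit):

  # Build the image, passing the build arguments the Dockerfile declares
  # (the URL and commit hash below are placeholders)
  docker build \
    --build-arg IHDELTA_REPO_URL=https://example.com/your-org/ihdelta.git \
    --build-arg START_AFTER_COMMIT=<commit-sha> \
    -t ihdelta-datasette .

  # Serve the generated SQLite database on the port used by the CMD line
  docker run -p 7860:7860 ihdelta-datasette
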
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: Index Herbariorum
+emoji: 🏛
 colorFrom: red
 colorTo: purple
 sdk: docker
extractcollectorname.py
DELETED
@@ -1,36 +0,0 @@
-import argparse
-import pandas as pd
-import bananompy
-from tqdm import tqdm
-tqdm.pandas()
-
-def getFirstFamilyName(s):
-    firstFamilyName = None
-    parsed = bananompy.parse(s)
-    try:
-        firstFamilyName = parsed[0]['parsed'][0]['family']
-    except:
-        pass
-    return firstFamilyName
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("inputfile")
-    parser.add_argument("outputfile")
-    args = parser.parse_args()
-
-    df = pd.read_csv(args.inputfile,
-        encoding='utf8',
-        keep_default_na=False,
-        na_values=['NONE',''],
-        on_bad_lines='skip',
-        sep=',')
-    # Extract unique recordedBy values
-    df_rb = df[['recordedBy']].drop_duplicates()
-    df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
-    # Apply back to main dataframe
-    df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
-    # Add column holding collector name and number
-    mask = (df.recordNumber.notnull())
-    df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
-    df.to_csv(args.outputfile, index=False, sep=',')
getDownloadMetadata.py
DELETED
@@ -1,28 +0,0 @@
-import argparse
-from pygbif import occurrences as occ
-import json
-
-
-licenses = {'http://creativecommons.org/licenses/by-nc/4.0/legalcode':'CC BY-NC 4.0'}
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("inputfile")
-    parser.add_argument("--download_id", type=str)
-    parser.add_argument("outputfile")
-
-    args = parser.parse_args()
-
-    datasette_metadata = None
-    with open(args.inputfile, 'r') as f_in:
-        datasette_metadata = json.load(f_in)
-
-    gbif_metadata = occ.download_meta(key = args.download_id)
-    license_url = gbif_metadata['license']
-    if license_url in licenses:
-        datasette_metadata['license'] = licenses[license_url]
-        datasette_metadata['license_url'] = license_url
-    datasette_metadata['source_url'] = 'https://doi.org/{}'.format(gbif_metadata['doi'])
-
-    datasette_metadata_json = json.dumps(datasette_metadata, indent=4)
-    with open(args.outputfile, 'w') as f_out:
-        f_out.write(datasette_metadata_json)
metadata.json
CHANGED
@@ -1,27 +1,15 @@
 {
-    "title": "
-    "description": "This is a datasette instance containing
-    "source": "
+    "title": "Revision controlled Index Herbariorum",
+    "description": "This is a datasette instance containing Index Herbariorum, gathered each week using the git scraping data pattern.",
+    "source": "Index Herbariorum (NYBG)",
     "databases": {
         "gbifocc": {
             "tables": {
                 "gbifocc": {
                     "plugins": {
-                        "datasette-reconcile": {
-                            "id_field": "gbifID",
-                            "name_field": "collectorNameAndNumber",
-                            "type_field": "basisOfRecord",
-                            "type_default": [{
-                                "id": "basisOfRecord",
-                                "name": "PRESERVED_SPECIMEN"
-                            }],
-                            "max_limit": 5,
-                            "service_name": "GBIF specimens reconciliation",
-                            "view_url": "https://gbif.org/occurrence/{{id}}"
-                        },
                         "datasette-cluster-map": {
-                            "latitude_column": "
-                            "longitude_column": "
+                            "latitude_column": "location.lat",
+                            "longitude_column": "location.lon"
                         }
                     }
                 }
requirements.txt
CHANGED
@@ -1,10 +1,4 @@
 datasette
-
+git-history
 datasette-cluster-map
-sqlite-utils
-csvs-to-sqlite
-pandas==1.5.3
-bananompy
-datasette-jellyfish
-tqdm
-pygbif
+sqlite-utils
tab2csv.py
DELETED
@@ -1,112 +0,0 @@
-import argparse
-import pandas as pd
-import requests
-from pygbif import occurrences as occ
-from tqdm import tqdm
-tqdm.pandas()
-import os.path
-
-def getFirstFamilyName(recordedBy):
-    firstFamilyName = None
-    parsed = bananompy.parse(recordedBy)
-    try:
-        firstFamilyName = parsed[0]['parsed'][0]['family']
-    except:
-        pass
-    return firstFamilyName
-
-def getFirstFamilyNames(recordedBy_l):
-    # post to bionomia
-    bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
-    data = dict()
-    data['names'] = '\r\n'.join(recordedBy_l)
-    r = requests.post(bionomia_parse_endpoint_url, data=data)
-    parsed_results = r.json()
-    results = dict()
-    for parsed_result in parsed_results:
-        try:
-            results[parsed_result['original']] = parsed_result['parsed'][0]['family']
-        except:
-            results[parsed_result['original']] = None
-    return results
-
-def getFirstFamilyNameBulk(df,
-        recordedByColName="recordedBy",
-        firstFamilyNameColName="recordedBy_first_familyname",
-        batchsize=500):
-    results = dict()
-    recordedBy_l = []
-    for s in tqdm(df[recordedByColName].values):
-        if len(recordedBy_l) == batchsize:
-            # send it
-            results.update(getFirstFamilyNames(recordedBy_l))
-            # clear for next iteration
-            recordedBy_l = []
-        recordedBy_l.append(s)
-    if len(recordedBy_l) > 0:
-        results.update(getFirstFamilyNames(recordedBy_l))
-    df[firstFamilyNameColName] = df[recordedByColName].map(results)
-    return df
-
-GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV = 'https://api.gbif.org/v1/occurrence/download/describe/simpleCsv'
-GBIF_DOWNLOAD_DESCRIBE_URL_DWCA = 'https://api.gbif.org/v1/occurrence/download/describe/dwca'
-
-def getGbifDownloadColumnNames(download_format):
-    column_names = None
-    if download_format == 'SIMPLE_CSV':
-        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_SIMPLE_CSV)
-        columns_metadata = r.json()
-        column_names = [column_metadata['name'] for column_metadata in columns_metadata['fields']]
-    elif download_format == 'DWCA':
-        r = requests.get(GBIF_DOWNLOAD_DESCRIBE_URL_DWCA)
-        columns_metadata = r.json()
-        column_names = [column_metadata['name'] for column_metadata in columns_metadata['verbatim']['fields']]
-    return column_names
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("data_dir")
-    parser.add_argument("download_id")
-    parser.add_argument("-c","--createcols", action='store_true')
-    parser.add_argument("-l","--limit", type=int)
-    parser.add_argument("outputfilename")
-    args = parser.parse_args()
-
-    # Determine format of datafile by accessing download metadata from GBIF API
-    gbif_metadata = occ.download_meta(key = args.download_id)
-    download_format = gbif_metadata['request']['format']
-    # The GBIF download format determines:
-    # (1) the columns in the download, SIMPLE_CSV being a much restricted set
-    #     of columns than DWCA
-    # (2) The name of the occurrence data file, SIMPLE_CSV : '[download_id].csv'
-    #     DWCA : 'occurrence.txt'
-    inputfilename = None
-    column_names_simple_csv = getGbifDownloadColumnNames('SIMPLE_CSV')
-    column_names = None
-    if download_format == 'SIMPLE_CSV':
-        inputfilename = '{}.csv'.format(args.download_id)
-        column_names = column_names_simple_csv
-    elif download_format == 'DWCA':
-        inputfilename = 'occurrence.txt'
-        column_names_dwca = getGbifDownloadColumnNames('DWCA')
-        column_names = [column_name for column_name in column_names_dwca if column_name in column_names_simple_csv]
-
-    df = pd.read_csv(os.path.join(args.data_dir,inputfilename),
-        encoding='utf8',
-        keep_default_na=False,
-        on_bad_lines='skip',
-        sep='\t',
-        usecols=column_names,
-        nrows=args.limit)
-    if args.createcols:
-        # Extract unique recordedBy values
-        df_rb = df[['recordedBy']].drop_duplicates()
-        df_rb = getFirstFamilyNameBulk(df_rb)
-        #df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
-        # Apply back to main dataframe
-        df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
-        # Add column holding collector name and number
-        mask = (df.recordNumber.notnull())
-        df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
-    df.to_csv(os.path.join(args.data_dir,args.outputfilename), index=False, sep=',')