Spaces:
Sleeping
Sleeping
Initial commit
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- Home.py +19 -0
- README.md +6 -7
- configs/_base_/default_runtime.py +17 -0
- configs/_base_/det_datasets/ctw1500.py +18 -0
- configs/_base_/det_datasets/icdar2015.py +18 -0
- configs/_base_/det_datasets/icdar2017.py +18 -0
- configs/_base_/det_datasets/synthtext.py +18 -0
- configs/_base_/det_datasets/toy_data.py +41 -0
- configs/_base_/det_models/dbnet_r18_fpnc.py +21 -0
- configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py +23 -0
- configs/_base_/det_models/dbnetpp_r50dcnv2_fpnc.py +28 -0
- configs/_base_/det_models/drrg_r50_fpn_unet.py +21 -0
- configs/_base_/det_models/fcenet_r50_fpn.py +33 -0
- configs/_base_/det_models/fcenet_r50dcnv2_fpn.py +35 -0
- configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py +126 -0
- configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py +126 -0
- configs/_base_/det_models/panet_r18_fpem_ffm.py +43 -0
- configs/_base_/det_models/panet_r50_fpem_ffm.py +21 -0
- configs/_base_/det_models/psenet_r50_fpnf.py +51 -0
- configs/_base_/det_models/textsnake_r50_fpn_unet.py +22 -0
- configs/_base_/det_pipelines/dbnet_pipeline.py +88 -0
- configs/_base_/det_pipelines/drrg_pipeline.py +60 -0
- configs/_base_/det_pipelines/fcenet_pipeline.py +118 -0
- configs/_base_/det_pipelines/maskrcnn_pipeline.py +57 -0
- configs/_base_/det_pipelines/panet_pipeline.py +156 -0
- configs/_base_/det_pipelines/psenet_pipeline.py +70 -0
- configs/_base_/det_pipelines/textsnake_pipeline.py +65 -0
- configs/_base_/recog_datasets/MJ_train.py +21 -0
- configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py +31 -0
- configs/_base_/recog_datasets/ST_MJ_train.py +29 -0
- configs/_base_/recog_datasets/ST_SA_MJ_real_train.py +81 -0
- configs/_base_/recog_datasets/ST_SA_MJ_train.py +48 -0
- configs/_base_/recog_datasets/ST_charbox_train.py +23 -0
- configs/_base_/recog_datasets/academic_test.py +57 -0
- configs/_base_/recog_datasets/seg_toy_data.py +34 -0
- configs/_base_/recog_datasets/toy_data.py +54 -0
- configs/_base_/recog_models/abinet.py +70 -0
- configs/_base_/recog_models/crnn.py +12 -0
- configs/_base_/recog_models/crnn_tps.py +18 -0
- configs/_base_/recog_models/master.py +61 -0
- configs/_base_/recog_models/nrtr_modality_transform.py +11 -0
- configs/_base_/recog_models/robust_scanner.py +24 -0
- configs/_base_/recog_models/sar.py +24 -0
- configs/_base_/recog_models/satrn.py +11 -0
- configs/_base_/recog_models/seg.py +21 -0
- configs/_base_/recog_pipelines/abinet_pipeline.py +96 -0
- configs/_base_/recog_pipelines/crnn_pipeline.py +35 -0
- configs/_base_/recog_pipelines/crnn_tps_pipeline.py +37 -0
- configs/_base_/recog_pipelines/master_pipeline.py +42 -0
- configs/_base_/recog_pipelines/nrtr_pipeline.py +38 -0
Home.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
st.set_page_config(page_title='OCR Comparator', layout ="wide")
|
4 |
+
st.image('ocr.png')
|
5 |
+
|
6 |
+
st.write("")
|
7 |
+
|
8 |
+
st.markdown('''#### OCR, or Optical Character Recognition, is a computer vision task, \
|
9 |
+
which includes the detection of text areas, and the recognition of characters.''')
|
10 |
+
st.write("")
|
11 |
+
st.write("")
|
12 |
+
|
13 |
+
st.markdown("##### This app allows you to compare, from a given image, the results of different solutions:")
|
14 |
+
st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
|
15 |
+
st.write("")
|
16 |
+
st.write("")
|
17 |
+
st.markdown("👈 Select the **About** page from the sidebar for information on how the app works")
|
18 |
+
|
19 |
+
st.markdown("👈 or directly select the **App** page")
|
README.md
CHANGED
@@ -1,12 +1,11 @@
|
|
1 |
---
|
2 |
title: Streamlit OCR Comparator
|
3 |
-
emoji:
|
4 |
colorFrom: indigo
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.10.0
|
8 |
-
app_file:
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
title: Streamlit OCR Comparator
|
3 |
+
emoji: 📰🔍🔤
|
4 |
colorFrom: indigo
|
5 |
+
colorTo: gray
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.10.0
|
8 |
+
app_file: Home.py
|
9 |
+
tags: [streamlit, ocr]
|
10 |
+
pinned: true
|
11 |
+
---
|
|
configs/_base_/default_runtime.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# yapf:disable
|
2 |
+
log_config = dict(
|
3 |
+
interval=5,
|
4 |
+
hooks=[
|
5 |
+
dict(type='TextLoggerHook')
|
6 |
+
])
|
7 |
+
# yapf:enable
|
8 |
+
dist_params = dict(backend='nccl')
|
9 |
+
log_level = 'INFO'
|
10 |
+
load_from = None
|
11 |
+
resume_from = None
|
12 |
+
workflow = [('train', 1)]
|
13 |
+
|
14 |
+
# disable opencv multithreading to avoid system being overloaded
|
15 |
+
opencv_num_threads = 0
|
16 |
+
# set multi-process start method as `fork` to speed up the training
|
17 |
+
mp_start_method = 'fork'
|
configs/_base_/det_datasets/ctw1500.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset_type = 'IcdarDataset'
|
2 |
+
data_root = 'data/ctw1500'
|
3 |
+
|
4 |
+
train = dict(
|
5 |
+
type=dataset_type,
|
6 |
+
ann_file=f'{data_root}/instances_training.json',
|
7 |
+
img_prefix=f'{data_root}/imgs',
|
8 |
+
pipeline=None)
|
9 |
+
|
10 |
+
test = dict(
|
11 |
+
type=dataset_type,
|
12 |
+
ann_file=f'{data_root}/instances_test.json',
|
13 |
+
img_prefix=f'{data_root}/imgs',
|
14 |
+
pipeline=None)
|
15 |
+
|
16 |
+
train_list = [train]
|
17 |
+
|
18 |
+
test_list = [test]
|
configs/_base_/det_datasets/icdar2015.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset_type = 'IcdarDataset'
|
2 |
+
data_root = 'data/icdar2015'
|
3 |
+
|
4 |
+
train = dict(
|
5 |
+
type=dataset_type,
|
6 |
+
ann_file=f'{data_root}/instances_training.json',
|
7 |
+
img_prefix=f'{data_root}/imgs',
|
8 |
+
pipeline=None)
|
9 |
+
|
10 |
+
test = dict(
|
11 |
+
type=dataset_type,
|
12 |
+
ann_file=f'{data_root}/instances_test.json',
|
13 |
+
img_prefix=f'{data_root}/imgs',
|
14 |
+
pipeline=None)
|
15 |
+
|
16 |
+
train_list = [train]
|
17 |
+
|
18 |
+
test_list = [test]
|
configs/_base_/det_datasets/icdar2017.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset_type = 'IcdarDataset'
|
2 |
+
data_root = 'data/icdar2017'
|
3 |
+
|
4 |
+
train = dict(
|
5 |
+
type=dataset_type,
|
6 |
+
ann_file=f'{data_root}/instances_training.json',
|
7 |
+
img_prefix=f'{data_root}/imgs',
|
8 |
+
pipeline=None)
|
9 |
+
|
10 |
+
test = dict(
|
11 |
+
type=dataset_type,
|
12 |
+
ann_file=f'{data_root}/instances_val.json',
|
13 |
+
img_prefix=f'{data_root}/imgs',
|
14 |
+
pipeline=None)
|
15 |
+
|
16 |
+
train_list = [train]
|
17 |
+
|
18 |
+
test_list = [test]
|
configs/_base_/det_datasets/synthtext.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset_type = 'TextDetDataset'
|
2 |
+
data_root = 'data/synthtext'
|
3 |
+
|
4 |
+
train = dict(
|
5 |
+
type=dataset_type,
|
6 |
+
ann_file=f'{data_root}/instances_training.lmdb',
|
7 |
+
loader=dict(
|
8 |
+
type='AnnFileLoader',
|
9 |
+
repeat=1,
|
10 |
+
file_format='lmdb',
|
11 |
+
parser=dict(
|
12 |
+
type='LineJsonParser',
|
13 |
+
keys=['file_name', 'height', 'width', 'annotations'])),
|
14 |
+
img_prefix=f'{data_root}/imgs',
|
15 |
+
pipeline=None)
|
16 |
+
|
17 |
+
train_list = [train]
|
18 |
+
test_list = [train]
|
configs/_base_/det_datasets/toy_data.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
root = 'tests/data/toy_dataset'
|
2 |
+
|
3 |
+
# dataset with type='TextDetDataset'
|
4 |
+
train1 = dict(
|
5 |
+
type='TextDetDataset',
|
6 |
+
img_prefix=f'{root}/imgs',
|
7 |
+
ann_file=f'{root}/instances_test.txt',
|
8 |
+
loader=dict(
|
9 |
+
type='AnnFileLoader',
|
10 |
+
repeat=4,
|
11 |
+
file_format='txt',
|
12 |
+
parser=dict(
|
13 |
+
type='LineJsonParser',
|
14 |
+
keys=['file_name', 'height', 'width', 'annotations'])),
|
15 |
+
pipeline=None,
|
16 |
+
test_mode=False)
|
17 |
+
|
18 |
+
# dataset with type='IcdarDataset'
|
19 |
+
train2 = dict(
|
20 |
+
type='IcdarDataset',
|
21 |
+
ann_file=f'{root}/instances_test.json',
|
22 |
+
img_prefix=f'{root}/imgs',
|
23 |
+
pipeline=None)
|
24 |
+
|
25 |
+
test = dict(
|
26 |
+
type='TextDetDataset',
|
27 |
+
img_prefix=f'{root}/imgs',
|
28 |
+
ann_file=f'{root}/instances_test.txt',
|
29 |
+
loader=dict(
|
30 |
+
type='AnnFileLoader',
|
31 |
+
repeat=1,
|
32 |
+
file_format='txt',
|
33 |
+
parser=dict(
|
34 |
+
type='LineJsonParser',
|
35 |
+
keys=['file_name', 'height', 'width', 'annotations'])),
|
36 |
+
pipeline=None,
|
37 |
+
test_mode=True)
|
38 |
+
|
39 |
+
train_list = [train1, train2]
|
40 |
+
|
41 |
+
test_list = [test]
|
configs/_base_/det_models/dbnet_r18_fpnc.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='DBNet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=18,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
|
11 |
+
norm_eval=False,
|
12 |
+
style='caffe'),
|
13 |
+
neck=dict(
|
14 |
+
type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256),
|
15 |
+
bbox_head=dict(
|
16 |
+
type='DBHead',
|
17 |
+
in_channels=256,
|
18 |
+
loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
|
19 |
+
postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
|
20 |
+
train_cfg=None,
|
21 |
+
test_cfg=None)
|
configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='DBNet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
norm_eval=False,
|
11 |
+
style='pytorch',
|
12 |
+
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
|
13 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
14 |
+
stage_with_dcn=(False, True, True, True)),
|
15 |
+
neck=dict(
|
16 |
+
type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256),
|
17 |
+
bbox_head=dict(
|
18 |
+
type='DBHead',
|
19 |
+
in_channels=256,
|
20 |
+
loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
|
21 |
+
postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
|
22 |
+
train_cfg=None,
|
23 |
+
test_cfg=None)
|
configs/_base_/det_models/dbnetpp_r50dcnv2_fpnc.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='DBNet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
norm_eval=False,
|
11 |
+
style='pytorch',
|
12 |
+
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
|
13 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
14 |
+
stage_with_dcn=(False, True, True, True)),
|
15 |
+
neck=dict(
|
16 |
+
type='FPNC',
|
17 |
+
in_channels=[256, 512, 1024, 2048],
|
18 |
+
lateral_channels=256,
|
19 |
+
asf_cfg=dict(attention_type='ScaleChannelSpatial')),
|
20 |
+
bbox_head=dict(
|
21 |
+
type='DBHead',
|
22 |
+
in_channels=256,
|
23 |
+
loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
|
24 |
+
postprocessor=dict(
|
25 |
+
type='DBPostprocessor', text_repr_type='quad',
|
26 |
+
epsilon_ratio=0.002)),
|
27 |
+
train_cfg=None,
|
28 |
+
test_cfg=None)
|
configs/_base_/det_models/drrg_r50_fpn_unet.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='DRRG',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
11 |
+
norm_eval=True,
|
12 |
+
style='caffe'),
|
13 |
+
neck=dict(
|
14 |
+
type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
|
15 |
+
bbox_head=dict(
|
16 |
+
type='DRRGHead',
|
17 |
+
in_channels=32,
|
18 |
+
text_region_thr=0.3,
|
19 |
+
center_region_thr=0.4,
|
20 |
+
loss=dict(type='DRRGLoss'),
|
21 |
+
postprocessor=dict(type='DRRGPostprocessor', link_thr=0.80)))
|
configs/_base_/det_models/fcenet_r50_fpn.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='FCENet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
11 |
+
norm_eval=False,
|
12 |
+
style='pytorch'),
|
13 |
+
neck=dict(
|
14 |
+
type='mmdet.FPN',
|
15 |
+
in_channels=[512, 1024, 2048],
|
16 |
+
out_channels=256,
|
17 |
+
add_extra_convs='on_output',
|
18 |
+
num_outs=3,
|
19 |
+
relu_before_extra_convs=True,
|
20 |
+
act_cfg=None),
|
21 |
+
bbox_head=dict(
|
22 |
+
type='FCEHead',
|
23 |
+
in_channels=256,
|
24 |
+
scales=(8, 16, 32),
|
25 |
+
fourier_degree=5,
|
26 |
+
loss=dict(type='FCELoss', num_sample=50),
|
27 |
+
postprocessor=dict(
|
28 |
+
type='FCEPostprocessor',
|
29 |
+
text_repr_type='quad',
|
30 |
+
num_reconstr_points=50,
|
31 |
+
alpha=1.2,
|
32 |
+
beta=1.0,
|
33 |
+
score_thr=0.3)))
|
configs/_base_/det_models/fcenet_r50dcnv2_fpn.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='FCENet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
norm_eval=True,
|
11 |
+
style='pytorch',
|
12 |
+
dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False),
|
13 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
14 |
+
stage_with_dcn=(False, True, True, True)),
|
15 |
+
neck=dict(
|
16 |
+
type='mmdet.FPN',
|
17 |
+
in_channels=[512, 1024, 2048],
|
18 |
+
out_channels=256,
|
19 |
+
add_extra_convs='on_output',
|
20 |
+
num_outs=3,
|
21 |
+
relu_before_extra_convs=True,
|
22 |
+
act_cfg=None),
|
23 |
+
bbox_head=dict(
|
24 |
+
type='FCEHead',
|
25 |
+
in_channels=256,
|
26 |
+
scales=(8, 16, 32),
|
27 |
+
fourier_degree=5,
|
28 |
+
loss=dict(type='FCELoss', num_sample=50),
|
29 |
+
postprocessor=dict(
|
30 |
+
type='FCEPostprocessor',
|
31 |
+
text_repr_type='poly',
|
32 |
+
num_reconstr_points=50,
|
33 |
+
alpha=1.0,
|
34 |
+
beta=2.0,
|
35 |
+
score_thr=0.3)))
|
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# model settings
|
2 |
+
model = dict(
|
3 |
+
type='OCRMaskRCNN',
|
4 |
+
backbone=dict(
|
5 |
+
type='mmdet.ResNet',
|
6 |
+
depth=50,
|
7 |
+
num_stages=4,
|
8 |
+
out_indices=(0, 1, 2, 3),
|
9 |
+
frozen_stages=1,
|
10 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
11 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
12 |
+
norm_eval=True,
|
13 |
+
style='pytorch'),
|
14 |
+
neck=dict(
|
15 |
+
type='mmdet.FPN',
|
16 |
+
in_channels=[256, 512, 1024, 2048],
|
17 |
+
out_channels=256,
|
18 |
+
num_outs=5),
|
19 |
+
rpn_head=dict(
|
20 |
+
type='RPNHead',
|
21 |
+
in_channels=256,
|
22 |
+
feat_channels=256,
|
23 |
+
anchor_generator=dict(
|
24 |
+
type='AnchorGenerator',
|
25 |
+
scales=[4],
|
26 |
+
ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
|
27 |
+
strides=[4, 8, 16, 32, 64]),
|
28 |
+
bbox_coder=dict(
|
29 |
+
type='DeltaXYWHBBoxCoder',
|
30 |
+
target_means=[.0, .0, .0, .0],
|
31 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
32 |
+
loss_cls=dict(
|
33 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
34 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
35 |
+
roi_head=dict(
|
36 |
+
type='StandardRoIHead',
|
37 |
+
bbox_roi_extractor=dict(
|
38 |
+
type='SingleRoIExtractor',
|
39 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
|
40 |
+
out_channels=256,
|
41 |
+
featmap_strides=[4, 8, 16, 32]),
|
42 |
+
bbox_head=dict(
|
43 |
+
type='Shared2FCBBoxHead',
|
44 |
+
in_channels=256,
|
45 |
+
fc_out_channels=1024,
|
46 |
+
roi_feat_size=7,
|
47 |
+
num_classes=1,
|
48 |
+
bbox_coder=dict(
|
49 |
+
type='DeltaXYWHBBoxCoder',
|
50 |
+
target_means=[0., 0., 0., 0.],
|
51 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
52 |
+
reg_class_agnostic=False,
|
53 |
+
loss_cls=dict(
|
54 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
55 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
56 |
+
mask_roi_extractor=dict(
|
57 |
+
type='SingleRoIExtractor',
|
58 |
+
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
|
59 |
+
out_channels=256,
|
60 |
+
featmap_strides=[4, 8, 16, 32]),
|
61 |
+
mask_head=dict(
|
62 |
+
type='FCNMaskHead',
|
63 |
+
num_convs=4,
|
64 |
+
in_channels=256,
|
65 |
+
conv_out_channels=256,
|
66 |
+
num_classes=1,
|
67 |
+
loss_mask=dict(
|
68 |
+
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
|
69 |
+
|
70 |
+
# model training and testing settings
|
71 |
+
train_cfg=dict(
|
72 |
+
rpn=dict(
|
73 |
+
assigner=dict(
|
74 |
+
type='MaxIoUAssigner',
|
75 |
+
pos_iou_thr=0.7,
|
76 |
+
neg_iou_thr=0.3,
|
77 |
+
min_pos_iou=0.3,
|
78 |
+
match_low_quality=True,
|
79 |
+
ignore_iof_thr=-1,
|
80 |
+
gpu_assign_thr=50),
|
81 |
+
sampler=dict(
|
82 |
+
type='RandomSampler',
|
83 |
+
num=256,
|
84 |
+
pos_fraction=0.5,
|
85 |
+
neg_pos_ub=-1,
|
86 |
+
add_gt_as_proposals=False),
|
87 |
+
allowed_border=-1,
|
88 |
+
pos_weight=-1,
|
89 |
+
debug=False),
|
90 |
+
rpn_proposal=dict(
|
91 |
+
nms_across_levels=False,
|
92 |
+
nms_pre=2000,
|
93 |
+
nms_post=1000,
|
94 |
+
max_per_img=1000,
|
95 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
96 |
+
min_bbox_size=0),
|
97 |
+
rcnn=dict(
|
98 |
+
assigner=dict(
|
99 |
+
type='MaxIoUAssigner',
|
100 |
+
pos_iou_thr=0.5,
|
101 |
+
neg_iou_thr=0.5,
|
102 |
+
min_pos_iou=0.5,
|
103 |
+
match_low_quality=True,
|
104 |
+
ignore_iof_thr=-1),
|
105 |
+
sampler=dict(
|
106 |
+
type='OHEMSampler',
|
107 |
+
num=512,
|
108 |
+
pos_fraction=0.25,
|
109 |
+
neg_pos_ub=-1,
|
110 |
+
add_gt_as_proposals=True),
|
111 |
+
mask_size=28,
|
112 |
+
pos_weight=-1,
|
113 |
+
debug=False)),
|
114 |
+
test_cfg=dict(
|
115 |
+
rpn=dict(
|
116 |
+
nms_across_levels=False,
|
117 |
+
nms_pre=1000,
|
118 |
+
nms_post=1000,
|
119 |
+
max_per_img=1000,
|
120 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
121 |
+
min_bbox_size=0),
|
122 |
+
rcnn=dict(
|
123 |
+
score_thr=0.05,
|
124 |
+
nms=dict(type='nms', iou_threshold=0.5),
|
125 |
+
max_per_img=100,
|
126 |
+
mask_thr_binary=0.5)))
|
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# model settings
|
2 |
+
model = dict(
|
3 |
+
type='OCRMaskRCNN',
|
4 |
+
text_repr_type='poly',
|
5 |
+
backbone=dict(
|
6 |
+
type='mmdet.ResNet',
|
7 |
+
depth=50,
|
8 |
+
num_stages=4,
|
9 |
+
out_indices=(0, 1, 2, 3),
|
10 |
+
frozen_stages=1,
|
11 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
12 |
+
norm_eval=True,
|
13 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
14 |
+
style='pytorch'),
|
15 |
+
neck=dict(
|
16 |
+
type='mmdet.FPN',
|
17 |
+
in_channels=[256, 512, 1024, 2048],
|
18 |
+
out_channels=256,
|
19 |
+
num_outs=5),
|
20 |
+
rpn_head=dict(
|
21 |
+
type='RPNHead',
|
22 |
+
in_channels=256,
|
23 |
+
feat_channels=256,
|
24 |
+
anchor_generator=dict(
|
25 |
+
type='AnchorGenerator',
|
26 |
+
scales=[4],
|
27 |
+
ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
|
28 |
+
strides=[4, 8, 16, 32, 64]),
|
29 |
+
bbox_coder=dict(
|
30 |
+
type='DeltaXYWHBBoxCoder',
|
31 |
+
target_means=[.0, .0, .0, .0],
|
32 |
+
target_stds=[1.0, 1.0, 1.0, 1.0]),
|
33 |
+
loss_cls=dict(
|
34 |
+
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
|
35 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
36 |
+
roi_head=dict(
|
37 |
+
type='StandardRoIHead',
|
38 |
+
bbox_roi_extractor=dict(
|
39 |
+
type='SingleRoIExtractor',
|
40 |
+
roi_layer=dict(type='RoIAlign', output_size=7, sample_num=0),
|
41 |
+
out_channels=256,
|
42 |
+
featmap_strides=[4, 8, 16, 32]),
|
43 |
+
bbox_head=dict(
|
44 |
+
type='Shared2FCBBoxHead',
|
45 |
+
in_channels=256,
|
46 |
+
fc_out_channels=1024,
|
47 |
+
roi_feat_size=7,
|
48 |
+
num_classes=80,
|
49 |
+
bbox_coder=dict(
|
50 |
+
type='DeltaXYWHBBoxCoder',
|
51 |
+
target_means=[0., 0., 0., 0.],
|
52 |
+
target_stds=[0.1, 0.1, 0.2, 0.2]),
|
53 |
+
reg_class_agnostic=False,
|
54 |
+
loss_cls=dict(
|
55 |
+
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
|
56 |
+
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
|
57 |
+
mask_roi_extractor=dict(
|
58 |
+
type='SingleRoIExtractor',
|
59 |
+
roi_layer=dict(type='RoIAlign', output_size=14, sample_num=0),
|
60 |
+
out_channels=256,
|
61 |
+
featmap_strides=[4, 8, 16, 32]),
|
62 |
+
mask_head=dict(
|
63 |
+
type='FCNMaskHead',
|
64 |
+
num_convs=4,
|
65 |
+
in_channels=256,
|
66 |
+
conv_out_channels=256,
|
67 |
+
num_classes=80,
|
68 |
+
loss_mask=dict(
|
69 |
+
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
|
70 |
+
# model training and testing settings
|
71 |
+
train_cfg=dict(
|
72 |
+
rpn=dict(
|
73 |
+
assigner=dict(
|
74 |
+
type='MaxIoUAssigner',
|
75 |
+
pos_iou_thr=0.7,
|
76 |
+
neg_iou_thr=0.3,
|
77 |
+
min_pos_iou=0.3,
|
78 |
+
match_low_quality=True,
|
79 |
+
ignore_iof_thr=-1),
|
80 |
+
sampler=dict(
|
81 |
+
type='RandomSampler',
|
82 |
+
num=256,
|
83 |
+
pos_fraction=0.5,
|
84 |
+
neg_pos_ub=-1,
|
85 |
+
add_gt_as_proposals=False),
|
86 |
+
allowed_border=-1,
|
87 |
+
pos_weight=-1,
|
88 |
+
debug=False),
|
89 |
+
rpn_proposal=dict(
|
90 |
+
nms_across_levels=False,
|
91 |
+
nms_pre=2000,
|
92 |
+
nms_post=1000,
|
93 |
+
max_per_img=1000,
|
94 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
95 |
+
min_bbox_size=0),
|
96 |
+
rcnn=dict(
|
97 |
+
assigner=dict(
|
98 |
+
type='MaxIoUAssigner',
|
99 |
+
pos_iou_thr=0.5,
|
100 |
+
neg_iou_thr=0.5,
|
101 |
+
min_pos_iou=0.5,
|
102 |
+
match_low_quality=True,
|
103 |
+
ignore_iof_thr=-1,
|
104 |
+
gpu_assign_thr=50),
|
105 |
+
sampler=dict(
|
106 |
+
type='OHEMSampler',
|
107 |
+
num=512,
|
108 |
+
pos_fraction=0.25,
|
109 |
+
neg_pos_ub=-1,
|
110 |
+
add_gt_as_proposals=True),
|
111 |
+
mask_size=28,
|
112 |
+
pos_weight=-1,
|
113 |
+
debug=False)),
|
114 |
+
test_cfg=dict(
|
115 |
+
rpn=dict(
|
116 |
+
nms_across_levels=False,
|
117 |
+
nms_pre=1000,
|
118 |
+
nms_post=1000,
|
119 |
+
max_per_img=1000,
|
120 |
+
nms=dict(type='nms', iou_threshold=0.7),
|
121 |
+
min_bbox_size=0),
|
122 |
+
rcnn=dict(
|
123 |
+
score_thr=0.05,
|
124 |
+
nms=dict(type='nms', iou_threshold=0.5),
|
125 |
+
max_per_img=100,
|
126 |
+
mask_thr_binary=0.5)))
|
configs/_base_/det_models/panet_r18_fpem_ffm.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model_poly = dict(
|
2 |
+
type='PANet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=18,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
10 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
|
11 |
+
norm_eval=True,
|
12 |
+
style='caffe'),
|
13 |
+
neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
|
14 |
+
bbox_head=dict(
|
15 |
+
type='PANHead',
|
16 |
+
in_channels=[128, 128, 128, 128],
|
17 |
+
out_channels=6,
|
18 |
+
loss=dict(type='PANLoss'),
|
19 |
+
postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
|
20 |
+
train_cfg=None,
|
21 |
+
test_cfg=None)
|
22 |
+
|
23 |
+
model_quad = dict(
|
24 |
+
type='PANet',
|
25 |
+
backbone=dict(
|
26 |
+
type='mmdet.ResNet',
|
27 |
+
depth=18,
|
28 |
+
num_stages=4,
|
29 |
+
out_indices=(0, 1, 2, 3),
|
30 |
+
frozen_stages=-1,
|
31 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
32 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
|
33 |
+
norm_eval=True,
|
34 |
+
style='caffe'),
|
35 |
+
neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
|
36 |
+
bbox_head=dict(
|
37 |
+
type='PANHead',
|
38 |
+
in_channels=[128, 128, 128, 128],
|
39 |
+
out_channels=6,
|
40 |
+
loss=dict(type='PANLoss'),
|
41 |
+
postprocessor=dict(type='PANPostprocessor', text_repr_type='quad')),
|
42 |
+
train_cfg=None,
|
43 |
+
test_cfg=None)
|
configs/_base_/det_models/panet_r50_fpem_ffm.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='PANet',
|
3 |
+
pretrained='torchvision://resnet50',
|
4 |
+
backbone=dict(
|
5 |
+
type='mmdet.ResNet',
|
6 |
+
depth=50,
|
7 |
+
num_stages=4,
|
8 |
+
out_indices=(0, 1, 2, 3),
|
9 |
+
frozen_stages=1,
|
10 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
11 |
+
norm_eval=True,
|
12 |
+
style='caffe'),
|
13 |
+
neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]),
|
14 |
+
bbox_head=dict(
|
15 |
+
type='PANHead',
|
16 |
+
in_channels=[128, 128, 128, 128],
|
17 |
+
out_channels=6,
|
18 |
+
loss=dict(type='PANLoss', speedup_bbox_thr=32),
|
19 |
+
postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
|
20 |
+
train_cfg=None,
|
21 |
+
test_cfg=None)
|
configs/_base_/det_models/psenet_r50_fpnf.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model_poly = dict(
|
2 |
+
type='PSENet',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
10 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
11 |
+
norm_eval=True,
|
12 |
+
style='caffe'),
|
13 |
+
neck=dict(
|
14 |
+
type='FPNF',
|
15 |
+
in_channels=[256, 512, 1024, 2048],
|
16 |
+
out_channels=256,
|
17 |
+
fusion_type='concat'),
|
18 |
+
bbox_head=dict(
|
19 |
+
type='PSEHead',
|
20 |
+
in_channels=[256],
|
21 |
+
out_channels=7,
|
22 |
+
loss=dict(type='PSELoss'),
|
23 |
+
postprocessor=dict(type='PSEPostprocessor', text_repr_type='poly')),
|
24 |
+
train_cfg=None,
|
25 |
+
test_cfg=None)
|
26 |
+
|
27 |
+
model_quad = dict(
|
28 |
+
type='PSENet',
|
29 |
+
backbone=dict(
|
30 |
+
type='mmdet.ResNet',
|
31 |
+
depth=50,
|
32 |
+
num_stages=4,
|
33 |
+
out_indices=(0, 1, 2, 3),
|
34 |
+
frozen_stages=-1,
|
35 |
+
norm_cfg=dict(type='SyncBN', requires_grad=True),
|
36 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
37 |
+
norm_eval=True,
|
38 |
+
style='caffe'),
|
39 |
+
neck=dict(
|
40 |
+
type='FPNF',
|
41 |
+
in_channels=[256, 512, 1024, 2048],
|
42 |
+
out_channels=256,
|
43 |
+
fusion_type='concat'),
|
44 |
+
bbox_head=dict(
|
45 |
+
type='PSEHead',
|
46 |
+
in_channels=[256],
|
47 |
+
out_channels=7,
|
48 |
+
loss=dict(type='PSELoss'),
|
49 |
+
postprocessor=dict(type='PSEPostprocessor', text_repr_type='quad')),
|
50 |
+
train_cfg=None,
|
51 |
+
test_cfg=None)
|
configs/_base_/det_models/textsnake_r50_fpn_unet.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model = dict(
|
2 |
+
type='TextSnake',
|
3 |
+
backbone=dict(
|
4 |
+
type='mmdet.ResNet',
|
5 |
+
depth=50,
|
6 |
+
num_stages=4,
|
7 |
+
out_indices=(0, 1, 2, 3),
|
8 |
+
frozen_stages=-1,
|
9 |
+
norm_cfg=dict(type='BN', requires_grad=True),
|
10 |
+
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
|
11 |
+
norm_eval=True,
|
12 |
+
style='caffe'),
|
13 |
+
neck=dict(
|
14 |
+
type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
|
15 |
+
bbox_head=dict(
|
16 |
+
type='TextSnakeHead',
|
17 |
+
in_channels=32,
|
18 |
+
loss=dict(type='TextSnakeLoss'),
|
19 |
+
postprocessor=dict(
|
20 |
+
type='TextSnakePostprocessor', text_repr_type='poly')),
|
21 |
+
train_cfg=None,
|
22 |
+
test_cfg=None)
|
configs/_base_/det_pipelines/dbnet_pipeline.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
train_pipeline_r18 = [
|
5 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
6 |
+
dict(
|
7 |
+
type='LoadTextAnnotations',
|
8 |
+
with_bbox=True,
|
9 |
+
with_mask=True,
|
10 |
+
poly2mask=False),
|
11 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
12 |
+
dict(type='Normalize', **img_norm_cfg),
|
13 |
+
dict(
|
14 |
+
type='ImgAug',
|
15 |
+
args=[['Fliplr', 0.5],
|
16 |
+
dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
|
17 |
+
dict(type='EastRandomCrop', target_size=(640, 640)),
|
18 |
+
dict(type='DBNetTargets', shrink_ratio=0.4),
|
19 |
+
dict(type='Pad', size_divisor=32),
|
20 |
+
dict(
|
21 |
+
type='CustomFormatBundle',
|
22 |
+
keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
|
23 |
+
visualize=dict(flag=False, boundary_key='gt_shrink')),
|
24 |
+
dict(
|
25 |
+
type='Collect',
|
26 |
+
keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
|
27 |
+
]
|
28 |
+
|
29 |
+
test_pipeline_1333_736 = [
|
30 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
31 |
+
dict(
|
32 |
+
type='MultiScaleFlipAug',
|
33 |
+
img_scale=(1333, 736), # used by Resize
|
34 |
+
flip=False,
|
35 |
+
transforms=[
|
36 |
+
dict(type='Resize', keep_ratio=True),
|
37 |
+
dict(type='Normalize', **img_norm_cfg),
|
38 |
+
dict(type='Pad', size_divisor=32),
|
39 |
+
dict(type='ImageToTensor', keys=['img']),
|
40 |
+
dict(type='Collect', keys=['img']),
|
41 |
+
])
|
42 |
+
]
|
43 |
+
|
44 |
+
# for dbnet_r50dcnv2_fpnc
|
45 |
+
img_norm_cfg_r50dcnv2 = dict(
|
46 |
+
mean=[122.67891434, 116.66876762, 104.00698793],
|
47 |
+
std=[58.395, 57.12, 57.375],
|
48 |
+
to_rgb=True)
|
49 |
+
|
50 |
+
train_pipeline_r50dcnv2 = [
|
51 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
52 |
+
dict(
|
53 |
+
type='LoadTextAnnotations',
|
54 |
+
with_bbox=True,
|
55 |
+
with_mask=True,
|
56 |
+
poly2mask=False),
|
57 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
58 |
+
dict(type='Normalize', **img_norm_cfg_r50dcnv2),
|
59 |
+
dict(
|
60 |
+
type='ImgAug',
|
61 |
+
args=[['Fliplr', 0.5],
|
62 |
+
dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
|
63 |
+
dict(type='EastRandomCrop', target_size=(640, 640)),
|
64 |
+
dict(type='DBNetTargets', shrink_ratio=0.4),
|
65 |
+
dict(type='Pad', size_divisor=32),
|
66 |
+
dict(
|
67 |
+
type='CustomFormatBundle',
|
68 |
+
keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
|
69 |
+
visualize=dict(flag=False, boundary_key='gt_shrink')),
|
70 |
+
dict(
|
71 |
+
type='Collect',
|
72 |
+
keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
|
73 |
+
]
|
74 |
+
|
75 |
+
test_pipeline_4068_1024 = [
|
76 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
77 |
+
dict(
|
78 |
+
type='MultiScaleFlipAug',
|
79 |
+
img_scale=(4068, 1024), # used by Resize
|
80 |
+
flip=False,
|
81 |
+
transforms=[
|
82 |
+
dict(type='Resize', keep_ratio=True),
|
83 |
+
dict(type='Normalize', **img_norm_cfg_r50dcnv2),
|
84 |
+
dict(type='Pad', size_divisor=32),
|
85 |
+
dict(type='ImageToTensor', keys=['img']),
|
86 |
+
dict(type='Collect', keys=['img']),
|
87 |
+
])
|
88 |
+
]
|
configs/_base_/det_pipelines/drrg_pipeline.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
train_pipeline = [
|
5 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
6 |
+
dict(
|
7 |
+
type='LoadTextAnnotations',
|
8 |
+
with_bbox=True,
|
9 |
+
with_mask=True,
|
10 |
+
poly2mask=False),
|
11 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
12 |
+
dict(type='Normalize', **img_norm_cfg),
|
13 |
+
dict(type='RandomScaling', size=800, scale=(0.75, 2.5)),
|
14 |
+
dict(
|
15 |
+
type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
|
16 |
+
dict(
|
17 |
+
type='RandomCropPolyInstances',
|
18 |
+
instance_key='gt_masks',
|
19 |
+
crop_ratio=0.8,
|
20 |
+
min_side_ratio=0.3),
|
21 |
+
dict(
|
22 |
+
type='RandomRotatePolyInstances',
|
23 |
+
rotate_ratio=0.5,
|
24 |
+
max_angle=60,
|
25 |
+
pad_with_fixed_color=False),
|
26 |
+
dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
|
27 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
28 |
+
dict(type='DRRGTargets'),
|
29 |
+
dict(type='Pad', size_divisor=32),
|
30 |
+
dict(
|
31 |
+
type='CustomFormatBundle',
|
32 |
+
keys=[
|
33 |
+
'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
|
34 |
+
'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
|
35 |
+
'gt_cos_map', 'gt_comp_attribs'
|
36 |
+
],
|
37 |
+
visualize=dict(flag=False, boundary_key='gt_text_mask')),
|
38 |
+
dict(
|
39 |
+
type='Collect',
|
40 |
+
keys=[
|
41 |
+
'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
|
42 |
+
'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
|
43 |
+
'gt_cos_map', 'gt_comp_attribs'
|
44 |
+
])
|
45 |
+
]
|
46 |
+
|
47 |
+
test_pipeline = [
|
48 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
49 |
+
dict(
|
50 |
+
type='MultiScaleFlipAug',
|
51 |
+
img_scale=(1024, 640), # used by Resize
|
52 |
+
flip=False,
|
53 |
+
transforms=[
|
54 |
+
dict(type='Resize', keep_ratio=True),
|
55 |
+
dict(type='Normalize', **img_norm_cfg),
|
56 |
+
dict(type='Pad', size_divisor=32),
|
57 |
+
dict(type='ImageToTensor', keys=['img']),
|
58 |
+
dict(type='Collect', keys=['img']),
|
59 |
+
])
|
60 |
+
]
|
configs/_base_/det_pipelines/fcenet_pipeline.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
# for icdar2015
|
5 |
+
leval_prop_range_icdar2015 = ((0, 0.4), (0.3, 0.7), (0.6, 1.0))
|
6 |
+
train_pipeline_icdar2015 = [
|
7 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
8 |
+
dict(
|
9 |
+
type='LoadTextAnnotations',
|
10 |
+
with_bbox=True,
|
11 |
+
with_mask=True,
|
12 |
+
poly2mask=False),
|
13 |
+
dict(
|
14 |
+
type='ColorJitter',
|
15 |
+
brightness=32.0 / 255,
|
16 |
+
saturation=0.5,
|
17 |
+
contrast=0.5),
|
18 |
+
dict(type='Normalize', **img_norm_cfg),
|
19 |
+
dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
|
20 |
+
dict(
|
21 |
+
type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
|
22 |
+
dict(
|
23 |
+
type='RandomCropPolyInstances',
|
24 |
+
instance_key='gt_masks',
|
25 |
+
crop_ratio=0.8,
|
26 |
+
min_side_ratio=0.3),
|
27 |
+
dict(
|
28 |
+
type='RandomRotatePolyInstances',
|
29 |
+
rotate_ratio=0.5,
|
30 |
+
max_angle=30,
|
31 |
+
pad_with_fixed_color=False),
|
32 |
+
dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
|
33 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
34 |
+
dict(type='Pad', size_divisor=32),
|
35 |
+
dict(
|
36 |
+
type='FCENetTargets',
|
37 |
+
fourier_degree=5,
|
38 |
+
level_proportion_range=leval_prop_range_icdar2015),
|
39 |
+
dict(
|
40 |
+
type='CustomFormatBundle',
|
41 |
+
keys=['p3_maps', 'p4_maps', 'p5_maps'],
|
42 |
+
visualize=dict(flag=False, boundary_key=None)),
|
43 |
+
dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
|
44 |
+
]
|
45 |
+
|
46 |
+
img_scale_icdar2015 = (2260, 2260)
|
47 |
+
test_pipeline_icdar2015 = [
|
48 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
49 |
+
dict(
|
50 |
+
type='MultiScaleFlipAug',
|
51 |
+
img_scale=img_scale_icdar2015, # used by Resize
|
52 |
+
flip=False,
|
53 |
+
transforms=[
|
54 |
+
dict(type='Resize', keep_ratio=True),
|
55 |
+
dict(type='Normalize', **img_norm_cfg),
|
56 |
+
dict(type='Pad', size_divisor=32),
|
57 |
+
dict(type='ImageToTensor', keys=['img']),
|
58 |
+
dict(type='Collect', keys=['img']),
|
59 |
+
])
|
60 |
+
]
|
61 |
+
|
62 |
+
# for ctw1500
|
63 |
+
leval_prop_range_ctw1500 = ((0, 0.25), (0.2, 0.65), (0.55, 1.0))
|
64 |
+
train_pipeline_ctw1500 = [
|
65 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
66 |
+
dict(
|
67 |
+
type='LoadTextAnnotations',
|
68 |
+
with_bbox=True,
|
69 |
+
with_mask=True,
|
70 |
+
poly2mask=False),
|
71 |
+
dict(
|
72 |
+
type='ColorJitter',
|
73 |
+
brightness=32.0 / 255,
|
74 |
+
saturation=0.5,
|
75 |
+
contrast=0.5),
|
76 |
+
dict(type='Normalize', **img_norm_cfg),
|
77 |
+
dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
|
78 |
+
dict(
|
79 |
+
type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
|
80 |
+
dict(
|
81 |
+
type='RandomCropPolyInstances',
|
82 |
+
instance_key='gt_masks',
|
83 |
+
crop_ratio=0.8,
|
84 |
+
min_side_ratio=0.3),
|
85 |
+
dict(
|
86 |
+
type='RandomRotatePolyInstances',
|
87 |
+
rotate_ratio=0.5,
|
88 |
+
max_angle=30,
|
89 |
+
pad_with_fixed_color=False),
|
90 |
+
dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
|
91 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
92 |
+
dict(type='Pad', size_divisor=32),
|
93 |
+
dict(
|
94 |
+
type='FCENetTargets',
|
95 |
+
fourier_degree=5,
|
96 |
+
level_proportion_range=leval_prop_range_ctw1500),
|
97 |
+
dict(
|
98 |
+
type='CustomFormatBundle',
|
99 |
+
keys=['p3_maps', 'p4_maps', 'p5_maps'],
|
100 |
+
visualize=dict(flag=False, boundary_key=None)),
|
101 |
+
dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
|
102 |
+
]
|
103 |
+
|
104 |
+
img_scale_ctw1500 = (1080, 736)
|
105 |
+
test_pipeline_ctw1500 = [
|
106 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
107 |
+
dict(
|
108 |
+
type='MultiScaleFlipAug',
|
109 |
+
img_scale=img_scale_ctw1500, # used by Resize
|
110 |
+
flip=False,
|
111 |
+
transforms=[
|
112 |
+
dict(type='Resize', keep_ratio=True),
|
113 |
+
dict(type='Normalize', **img_norm_cfg),
|
114 |
+
dict(type='Pad', size_divisor=32),
|
115 |
+
dict(type='ImageToTensor', keys=['img']),
|
116 |
+
dict(type='Collect', keys=['img']),
|
117 |
+
])
|
118 |
+
]
|
configs/_base_/det_pipelines/maskrcnn_pipeline.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
train_pipeline = [
|
5 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
6 |
+
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
|
7 |
+
dict(
|
8 |
+
type='ScaleAspectJitter',
|
9 |
+
img_scale=None,
|
10 |
+
keep_ratio=False,
|
11 |
+
resize_type='indep_sample_in_range',
|
12 |
+
scale_range=(640, 2560)),
|
13 |
+
dict(type='RandomFlip', flip_ratio=0.5),
|
14 |
+
dict(type='Normalize', **img_norm_cfg),
|
15 |
+
dict(
|
16 |
+
type='RandomCropInstances',
|
17 |
+
target_size=(640, 640),
|
18 |
+
mask_type='union_all',
|
19 |
+
instance_key='gt_masks'),
|
20 |
+
dict(type='Pad', size_divisor=32),
|
21 |
+
dict(type='DefaultFormatBundle'),
|
22 |
+
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
|
23 |
+
]
|
24 |
+
|
25 |
+
# for ctw1500
|
26 |
+
img_scale_ctw1500 = (1600, 1600)
|
27 |
+
test_pipeline_ctw1500 = [
|
28 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
29 |
+
dict(
|
30 |
+
type='MultiScaleFlipAug',
|
31 |
+
img_scale=img_scale_ctw1500, # used by Resize
|
32 |
+
flip=False,
|
33 |
+
transforms=[
|
34 |
+
dict(type='Resize', keep_ratio=True),
|
35 |
+
dict(type='RandomFlip'),
|
36 |
+
dict(type='Normalize', **img_norm_cfg),
|
37 |
+
dict(type='ImageToTensor', keys=['img']),
|
38 |
+
dict(type='Collect', keys=['img']),
|
39 |
+
])
|
40 |
+
]
|
41 |
+
|
42 |
+
# for icdar2015
|
43 |
+
img_scale_icdar2015 = (1920, 1920)
|
44 |
+
test_pipeline_icdar2015 = [
|
45 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
46 |
+
dict(
|
47 |
+
type='MultiScaleFlipAug',
|
48 |
+
img_scale=img_scale_icdar2015, # used by Resize
|
49 |
+
flip=False,
|
50 |
+
transforms=[
|
51 |
+
dict(type='Resize', keep_ratio=True),
|
52 |
+
dict(type='RandomFlip'),
|
53 |
+
dict(type='Normalize', **img_norm_cfg),
|
54 |
+
dict(type='ImageToTensor', keys=['img']),
|
55 |
+
dict(type='Collect', keys=['img']),
|
56 |
+
])
|
57 |
+
]
|
configs/_base_/det_pipelines/panet_pipeline.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
# for ctw1500
|
5 |
+
img_scale_train_ctw1500 = [(3000, 640)]
|
6 |
+
shrink_ratio_train_ctw1500 = (1.0, 0.7)
|
7 |
+
target_size_train_ctw1500 = (640, 640)
|
8 |
+
train_pipeline_ctw1500 = [
|
9 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
10 |
+
dict(
|
11 |
+
type='LoadTextAnnotations',
|
12 |
+
with_bbox=True,
|
13 |
+
with_mask=True,
|
14 |
+
poly2mask=False),
|
15 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
16 |
+
dict(type='Normalize', **img_norm_cfg),
|
17 |
+
dict(
|
18 |
+
type='ScaleAspectJitter',
|
19 |
+
img_scale=img_scale_train_ctw1500,
|
20 |
+
ratio_range=(0.7, 1.3),
|
21 |
+
aspect_ratio_range=(0.9, 1.1),
|
22 |
+
multiscale_mode='value',
|
23 |
+
keep_ratio=False),
|
24 |
+
# shrink_ratio is from big to small. The 1st must be 1.0
|
25 |
+
dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_ctw1500),
|
26 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
27 |
+
dict(type='RandomRotateTextDet'),
|
28 |
+
dict(
|
29 |
+
type='RandomCropInstances',
|
30 |
+
target_size=target_size_train_ctw1500,
|
31 |
+
instance_key='gt_kernels'),
|
32 |
+
dict(type='Pad', size_divisor=32),
|
33 |
+
dict(
|
34 |
+
type='CustomFormatBundle',
|
35 |
+
keys=['gt_kernels', 'gt_mask'],
|
36 |
+
visualize=dict(flag=False, boundary_key='gt_kernels')),
|
37 |
+
dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
|
38 |
+
]
|
39 |
+
|
40 |
+
img_scale_test_ctw1500 = (3000, 640)
|
41 |
+
test_pipeline_ctw1500 = [
|
42 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
43 |
+
dict(
|
44 |
+
type='MultiScaleFlipAug',
|
45 |
+
img_scale=img_scale_test_ctw1500, # used by Resize
|
46 |
+
flip=False,
|
47 |
+
transforms=[
|
48 |
+
dict(type='Resize', keep_ratio=True),
|
49 |
+
dict(type='Normalize', **img_norm_cfg),
|
50 |
+
dict(type='Pad', size_divisor=32),
|
51 |
+
dict(type='ImageToTensor', keys=['img']),
|
52 |
+
dict(type='Collect', keys=['img']),
|
53 |
+
])
|
54 |
+
]
|
55 |
+
|
56 |
+
# for icdar2015
|
57 |
+
img_scale_train_icdar2015 = [(3000, 736)]
|
58 |
+
shrink_ratio_train_icdar2015 = (1.0, 0.5)
|
59 |
+
target_size_train_icdar2015 = (736, 736)
|
60 |
+
train_pipeline_icdar2015 = [
|
61 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
62 |
+
dict(
|
63 |
+
type='LoadTextAnnotations',
|
64 |
+
with_bbox=True,
|
65 |
+
with_mask=True,
|
66 |
+
poly2mask=False),
|
67 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
68 |
+
dict(type='Normalize', **img_norm_cfg),
|
69 |
+
dict(
|
70 |
+
type='ScaleAspectJitter',
|
71 |
+
img_scale=img_scale_train_icdar2015,
|
72 |
+
ratio_range=(0.7, 1.3),
|
73 |
+
aspect_ratio_range=(0.9, 1.1),
|
74 |
+
multiscale_mode='value',
|
75 |
+
keep_ratio=False),
|
76 |
+
dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2015),
|
77 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
78 |
+
dict(type='RandomRotateTextDet'),
|
79 |
+
dict(
|
80 |
+
type='RandomCropInstances',
|
81 |
+
target_size=target_size_train_icdar2015,
|
82 |
+
instance_key='gt_kernels'),
|
83 |
+
dict(type='Pad', size_divisor=32),
|
84 |
+
dict(
|
85 |
+
type='CustomFormatBundle',
|
86 |
+
keys=['gt_kernels', 'gt_mask'],
|
87 |
+
visualize=dict(flag=False, boundary_key='gt_kernels')),
|
88 |
+
dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
|
89 |
+
]
|
90 |
+
|
91 |
+
img_scale_test_icdar2015 = (1333, 736)
|
92 |
+
test_pipeline_icdar2015 = [
|
93 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
94 |
+
dict(
|
95 |
+
type='MultiScaleFlipAug',
|
96 |
+
img_scale=img_scale_test_icdar2015, # used by Resize
|
97 |
+
flip=False,
|
98 |
+
transforms=[
|
99 |
+
dict(type='Resize', keep_ratio=True),
|
100 |
+
dict(type='Normalize', **img_norm_cfg),
|
101 |
+
dict(type='Pad', size_divisor=32),
|
102 |
+
dict(type='ImageToTensor', keys=['img']),
|
103 |
+
dict(type='Collect', keys=['img']),
|
104 |
+
])
|
105 |
+
]
|
106 |
+
|
107 |
+
# for icdar2017
|
108 |
+
img_scale_train_icdar2017 = [(3000, 800)]
|
109 |
+
shrink_ratio_train_icdar2017 = (1.0, 0.5)
|
110 |
+
target_size_train_icdar2017 = (800, 800)
|
111 |
+
train_pipeline_icdar2017 = [
|
112 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
113 |
+
dict(
|
114 |
+
type='LoadTextAnnotations',
|
115 |
+
with_bbox=True,
|
116 |
+
with_mask=True,
|
117 |
+
poly2mask=False),
|
118 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
119 |
+
dict(type='Normalize', **img_norm_cfg),
|
120 |
+
dict(
|
121 |
+
type='ScaleAspectJitter',
|
122 |
+
img_scale=img_scale_train_icdar2017,
|
123 |
+
ratio_range=(0.7, 1.3),
|
124 |
+
aspect_ratio_range=(0.9, 1.1),
|
125 |
+
multiscale_mode='value',
|
126 |
+
keep_ratio=False),
|
127 |
+
dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2017),
|
128 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
129 |
+
dict(type='RandomRotateTextDet'),
|
130 |
+
dict(
|
131 |
+
type='RandomCropInstances',
|
132 |
+
target_size=target_size_train_icdar2017,
|
133 |
+
instance_key='gt_kernels'),
|
134 |
+
dict(type='Pad', size_divisor=32),
|
135 |
+
dict(
|
136 |
+
type='CustomFormatBundle',
|
137 |
+
keys=['gt_kernels', 'gt_mask'],
|
138 |
+
visualize=dict(flag=False, boundary_key='gt_kernels')),
|
139 |
+
dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
|
140 |
+
]
|
141 |
+
|
142 |
+
img_scale_test_icdar2017 = (1333, 800)
|
143 |
+
test_pipeline_icdar2017 = [
|
144 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
145 |
+
dict(
|
146 |
+
type='MultiScaleFlipAug',
|
147 |
+
img_scale=img_scale_test_icdar2017, # used by Resize
|
148 |
+
flip=False,
|
149 |
+
transforms=[
|
150 |
+
dict(type='Resize', keep_ratio=True),
|
151 |
+
dict(type='Normalize', **img_norm_cfg),
|
152 |
+
dict(type='Pad', size_divisor=32),
|
153 |
+
dict(type='ImageToTensor', keys=['img']),
|
154 |
+
dict(type='Collect', keys=['img']),
|
155 |
+
])
|
156 |
+
]
|
configs/_base_/det_pipelines/psenet_pipeline.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
train_pipeline = [
|
5 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
6 |
+
dict(
|
7 |
+
type='LoadTextAnnotations',
|
8 |
+
with_bbox=True,
|
9 |
+
with_mask=True,
|
10 |
+
poly2mask=False),
|
11 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
12 |
+
dict(type='Normalize', **img_norm_cfg),
|
13 |
+
dict(
|
14 |
+
type='ScaleAspectJitter',
|
15 |
+
img_scale=[(3000, 736)],
|
16 |
+
ratio_range=(0.5, 3),
|
17 |
+
aspect_ratio_range=(1, 1),
|
18 |
+
multiscale_mode='value',
|
19 |
+
long_size_bound=1280,
|
20 |
+
short_size_bound=640,
|
21 |
+
resize_type='long_short_bound',
|
22 |
+
keep_ratio=False),
|
23 |
+
dict(type='PSENetTargets'),
|
24 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
25 |
+
dict(type='RandomRotateTextDet'),
|
26 |
+
dict(
|
27 |
+
type='RandomCropInstances',
|
28 |
+
target_size=(640, 640),
|
29 |
+
instance_key='gt_kernels'),
|
30 |
+
dict(type='Pad', size_divisor=32),
|
31 |
+
dict(
|
32 |
+
type='CustomFormatBundle',
|
33 |
+
keys=['gt_kernels', 'gt_mask'],
|
34 |
+
visualize=dict(flag=False, boundary_key='gt_kernels')),
|
35 |
+
dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
|
36 |
+
]
|
37 |
+
|
38 |
+
# for ctw1500
|
39 |
+
img_scale_test_ctw1500 = (1280, 1280)
|
40 |
+
test_pipeline_ctw1500 = [
|
41 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
42 |
+
dict(
|
43 |
+
type='MultiScaleFlipAug',
|
44 |
+
img_scale=img_scale_test_ctw1500, # used by Resize
|
45 |
+
flip=False,
|
46 |
+
transforms=[
|
47 |
+
dict(type='Resize', keep_ratio=True),
|
48 |
+
dict(type='Normalize', **img_norm_cfg),
|
49 |
+
dict(type='Pad', size_divisor=32),
|
50 |
+
dict(type='ImageToTensor', keys=['img']),
|
51 |
+
dict(type='Collect', keys=['img']),
|
52 |
+
])
|
53 |
+
]
|
54 |
+
|
55 |
+
# for icdar2015
|
56 |
+
img_scale_test_icdar2015 = (2240, 2240)
|
57 |
+
test_pipeline_icdar2015 = [
|
58 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
59 |
+
dict(
|
60 |
+
type='MultiScaleFlipAug',
|
61 |
+
img_scale=img_scale_test_icdar2015, # used by Resize
|
62 |
+
flip=False,
|
63 |
+
transforms=[
|
64 |
+
dict(type='Resize', keep_ratio=True),
|
65 |
+
dict(type='Normalize', **img_norm_cfg),
|
66 |
+
dict(type='Pad', size_divisor=32),
|
67 |
+
dict(type='ImageToTensor', keys=['img']),
|
68 |
+
dict(type='Collect', keys=['img']),
|
69 |
+
])
|
70 |
+
]
|
configs/_base_/det_pipelines/textsnake_pipeline.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
img_norm_cfg = dict(
|
2 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
3 |
+
|
4 |
+
train_pipeline = [
|
5 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
6 |
+
dict(
|
7 |
+
type='LoadTextAnnotations',
|
8 |
+
with_bbox=True,
|
9 |
+
with_mask=True,
|
10 |
+
poly2mask=False),
|
11 |
+
dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
|
12 |
+
dict(type='Normalize', **img_norm_cfg),
|
13 |
+
dict(
|
14 |
+
type='RandomCropPolyInstances',
|
15 |
+
instance_key='gt_masks',
|
16 |
+
crop_ratio=0.65,
|
17 |
+
min_side_ratio=0.3),
|
18 |
+
dict(
|
19 |
+
type='RandomRotatePolyInstances',
|
20 |
+
rotate_ratio=0.5,
|
21 |
+
max_angle=20,
|
22 |
+
pad_with_fixed_color=False),
|
23 |
+
dict(
|
24 |
+
type='ScaleAspectJitter',
|
25 |
+
img_scale=[(3000, 736)], # unused
|
26 |
+
ratio_range=(0.7, 1.3),
|
27 |
+
aspect_ratio_range=(0.9, 1.1),
|
28 |
+
multiscale_mode='value',
|
29 |
+
long_size_bound=800,
|
30 |
+
short_size_bound=480,
|
31 |
+
resize_type='long_short_bound',
|
32 |
+
keep_ratio=False),
|
33 |
+
dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
|
34 |
+
dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
|
35 |
+
dict(type='TextSnakeTargets'),
|
36 |
+
dict(type='Pad', size_divisor=32),
|
37 |
+
dict(
|
38 |
+
type='CustomFormatBundle',
|
39 |
+
keys=[
|
40 |
+
'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
|
41 |
+
'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
|
42 |
+
],
|
43 |
+
visualize=dict(flag=False, boundary_key='gt_text_mask')),
|
44 |
+
dict(
|
45 |
+
type='Collect',
|
46 |
+
keys=[
|
47 |
+
'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
|
48 |
+
'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
|
49 |
+
])
|
50 |
+
]
|
51 |
+
|
52 |
+
test_pipeline = [
|
53 |
+
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
54 |
+
dict(
|
55 |
+
type='MultiScaleFlipAug',
|
56 |
+
img_scale=(1333, 736), # used by Resize
|
57 |
+
flip=False,
|
58 |
+
transforms=[
|
59 |
+
dict(type='Resize', keep_ratio=True),
|
60 |
+
dict(type='Normalize', **img_norm_cfg),
|
61 |
+
dict(type='Pad', size_divisor=32),
|
62 |
+
dict(type='ImageToTensor', keys=['img']),
|
63 |
+
dict(type='Collect', keys=['img']),
|
64 |
+
])
|
65 |
+
]
|
configs/_base_/recog_datasets/MJ_train.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Training set, including:
|
2 |
+
# Synthetic Datasets: Syn90k
|
3 |
+
|
4 |
+
train_root = 'data/mixture/Syn90k'
|
5 |
+
|
6 |
+
train_img_prefix = f'{train_root}/mnt/ramdisk/max/90kDICT32px'
|
7 |
+
train_ann_file = f'{train_root}/label.lmdb'
|
8 |
+
|
9 |
+
train = dict(
|
10 |
+
type='OCRDataset',
|
11 |
+
img_prefix=train_img_prefix,
|
12 |
+
ann_file=train_ann_file,
|
13 |
+
loader=dict(
|
14 |
+
type='AnnFileLoader',
|
15 |
+
repeat=1,
|
16 |
+
file_format='lmdb',
|
17 |
+
parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
|
18 |
+
pipeline=None,
|
19 |
+
test_mode=False)
|
20 |
+
|
21 |
+
train_list = [train]
|
configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Training set, including:
|
2 |
+
# Synthetic Datasets: SynthText, Syn90k
|
3 |
+
# Both annotations are filtered so that
|
4 |
+
# only alphanumeric terms are left
|
5 |
+
|
6 |
+
train_root = 'data/mixture'
|
7 |
+
|
8 |
+
train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
|
9 |
+
train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
|
10 |
+
|
11 |
+
train1 = dict(
|
12 |
+
type='OCRDataset',
|
13 |
+
img_prefix=train_img_prefix1,
|
14 |
+
ann_file=train_ann_file1,
|
15 |
+
loader=dict(
|
16 |
+
type='AnnFileLoader',
|
17 |
+
repeat=1,
|
18 |
+
file_format='lmdb',
|
19 |
+
parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
|
20 |
+
pipeline=None,
|
21 |
+
test_mode=False)
|
22 |
+
|
23 |
+
train_img_prefix2 = f'{train_root}/SynthText/' + \
|
24 |
+
'synthtext/SynthText_patch_horizontal'
|
25 |
+
train_ann_file2 = f'{train_root}/SynthText/alphanumeric_label.lmdb'
|
26 |
+
|
27 |
+
train2 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
28 |
+
train2['img_prefix'] = train_img_prefix2
|
29 |
+
train2['ann_file'] = train_ann_file2
|
30 |
+
|
31 |
+
train_list = [train1, train2]
|
configs/_base_/recog_datasets/ST_MJ_train.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Training set, including:
|
2 |
+
# Synthetic Datasets: SynthText, Syn90k
|
3 |
+
|
4 |
+
train_root = 'data/mixture'
|
5 |
+
|
6 |
+
train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
|
7 |
+
train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
|
8 |
+
|
9 |
+
train1 = dict(
|
10 |
+
type='OCRDataset',
|
11 |
+
img_prefix=train_img_prefix1,
|
12 |
+
ann_file=train_ann_file1,
|
13 |
+
loader=dict(
|
14 |
+
type='AnnFileLoader',
|
15 |
+
repeat=1,
|
16 |
+
file_format='lmdb',
|
17 |
+
parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
|
18 |
+
pipeline=None,
|
19 |
+
test_mode=False)
|
20 |
+
|
21 |
+
train_img_prefix2 = f'{train_root}/SynthText/' + \
|
22 |
+
'synthtext/SynthText_patch_horizontal'
|
23 |
+
train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
|
24 |
+
|
25 |
+
train2 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
26 |
+
train2['img_prefix'] = train_img_prefix2
|
27 |
+
train2['ann_file'] = train_ann_file2
|
28 |
+
|
29 |
+
train_list = [train1, train2]
|
configs/_base_/recog_datasets/ST_SA_MJ_real_train.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Training set, including:
|
2 |
+
# Synthetic Datasets: SynthText, SynthAdd, Syn90k
|
3 |
+
# Real Dataset: IC11, IC13, IC15, COCO-Text, IIIT5k
|
4 |
+
|
5 |
+
# Root directory that every dataset path below is built from.
train_prefix = 'data/mixture'

# Image directories, one per dataset. The numeric suffix pairs each
# prefix with the annotation file of the same number.
train_img_prefix1 = f'{train_prefix}/icdar_2011'
train_img_prefix2 = f'{train_prefix}/icdar_2013'
train_img_prefix3 = f'{train_prefix}/icdar_2015'
train_img_prefix4 = f'{train_prefix}/coco_text'
train_img_prefix5 = f'{train_prefix}/IIIT5K'
train_img_prefix6 = f'{train_prefix}/SynthText_Add'
train_img_prefix7 = f'{train_prefix}/SynthText'
train_img_prefix8 = f'{train_prefix}/Syn90k'

# Annotation files, one per dataset. Each value must be a plain string:
# a trailing comma after the f-string turns the assignment into a
# 1-tuple, so none of these lines may end with a comma.
train_ann_file1 = f'{train_prefix}/icdar_2011/train_label.txt'
train_ann_file2 = f'{train_prefix}/icdar_2013/train_label.txt'
train_ann_file3 = f'{train_prefix}/icdar_2015/train_label.txt'
train_ann_file4 = f'{train_prefix}/coco_text/train_label.txt'
train_ann_file5 = f'{train_prefix}/IIIT5K/train_label.txt'
train_ann_file6 = f'{train_prefix}/SynthText_Add/label.txt'
train_ann_file7 = f'{train_prefix}/SynthText/shuffle_labels.txt'
train_ann_file8 = f'{train_prefix}/Syn90k/shuffle_labels.txt'
|
24 |
+
|
25 |
+
train1 = dict(
|
26 |
+
type='OCRDataset',
|
27 |
+
img_prefix=train_img_prefix1,
|
28 |
+
ann_file=train_ann_file1,
|
29 |
+
loader=dict(
|
30 |
+
type='AnnFileLoader',
|
31 |
+
repeat=20,
|
32 |
+
file_format='txt',
|
33 |
+
parser=dict(
|
34 |
+
type='LineStrParser',
|
35 |
+
keys=['filename', 'text'],
|
36 |
+
keys_idx=[0, 1],
|
37 |
+
separator=' ')),
|
38 |
+
pipeline=None,
|
39 |
+
test_mode=False)
|
40 |
+
|
41 |
+
train2 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
42 |
+
train2['img_prefix'] = train_img_prefix2
|
43 |
+
train2['ann_file'] = train_ann_file2
|
44 |
+
|
45 |
+
train3 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
46 |
+
train3['img_prefix'] = train_img_prefix3
|
47 |
+
train3['ann_file'] = train_ann_file3
|
48 |
+
|
49 |
+
train4 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
50 |
+
train4['img_prefix'] = train_img_prefix4
|
51 |
+
train4['ann_file'] = train_ann_file4
|
52 |
+
|
53 |
+
train5 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
54 |
+
train5['img_prefix'] = train_img_prefix5
|
55 |
+
train5['ann_file'] = train_ann_file5
|
56 |
+
|
57 |
+
train6 = dict(
|
58 |
+
type='OCRDataset',
|
59 |
+
img_prefix=train_img_prefix6,
|
60 |
+
ann_file=train_ann_file6,
|
61 |
+
loader=dict(
|
62 |
+
type='AnnFileLoader',
|
63 |
+
repeat=1,
|
64 |
+
file_format='txt',
|
65 |
+
parser=dict(
|
66 |
+
type='LineStrParser',
|
67 |
+
keys=['filename', 'text'],
|
68 |
+
keys_idx=[0, 1],
|
69 |
+
separator=' ')),
|
70 |
+
pipeline=None,
|
71 |
+
test_mode=False)
|
72 |
+
|
73 |
+
train7 = dict(train6)  # shallow copy; img_prefix/ann_file overridden below
|
74 |
+
train7['img_prefix'] = train_img_prefix7
|
75 |
+
train7['ann_file'] = train_ann_file7
|
76 |
+
|
77 |
+
train8 = dict(train6)  # shallow copy; img_prefix/ann_file overridden below
|
78 |
+
train8['img_prefix'] = train_img_prefix8
|
79 |
+
train8['ann_file'] = train_ann_file8
|
80 |
+
|
81 |
+
train_list = [train1, train2, train3, train4, train5, train6, train7, train8]
|
configs/_base_/recog_datasets/ST_SA_MJ_train.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Training set, including:
|
2 |
+
# Synthetic Datasets: SynthText, SynthAdd, Syn90k
|
3 |
+
|
4 |
+
train_root = 'data/mixture'
|
5 |
+
|
6 |
+
train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
|
7 |
+
train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
|
8 |
+
|
9 |
+
train1 = dict(
|
10 |
+
type='OCRDataset',
|
11 |
+
img_prefix=train_img_prefix1,
|
12 |
+
ann_file=train_ann_file1,
|
13 |
+
loader=dict(
|
14 |
+
type='AnnFileLoader',
|
15 |
+
repeat=1,
|
16 |
+
file_format='lmdb',
|
17 |
+
parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
|
18 |
+
pipeline=None,
|
19 |
+
test_mode=False)
|
20 |
+
|
21 |
+
train_img_prefix2 = f'{train_root}/SynthText/' + \
|
22 |
+
'synthtext/SynthText_patch_horizontal'
|
23 |
+
train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
|
24 |
+
|
25 |
+
train_img_prefix3 = f'{train_root}/SynthText_Add'
|
26 |
+
train_ann_file3 = f'{train_root}/SynthText_Add/label.txt'
|
27 |
+
|
28 |
+
train2 = dict(train1)  # shallow copy; img_prefix/ann_file overridden below
|
29 |
+
train2['img_prefix'] = train_img_prefix2
|
30 |
+
train2['ann_file'] = train_ann_file2
|
31 |
+
|
32 |
+
train3 = dict(
|
33 |
+
type='OCRDataset',
|
34 |
+
img_prefix=train_img_prefix3,
|
35 |
+
ann_file=train_ann_file3,
|
36 |
+
loader=dict(
|
37 |
+
type='AnnFileLoader',
|
38 |
+
repeat=1,
|
39 |
+
file_format='txt',
|
40 |
+
parser=dict(
|
41 |
+
type='LineStrParser',
|
42 |
+
keys=['filename', 'text'],
|
43 |
+
keys_idx=[0, 1],
|
44 |
+
separator=' ')),
|
45 |
+
pipeline=None,
|
46 |
+
test_mode=False)
|
47 |
+
|
48 |
+
train_list = [train1, train2, train3]
|
configs/_base_/recog_datasets/ST_charbox_train.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Training set, including:
|
2 |
+
# Synthetic Datasets: SynthText (with character level boxes)
|
3 |
+
|
4 |
+
train_img_root = 'data/mixture'
|
5 |
+
|
6 |
+
train_img_prefix = f'{train_img_root}/SynthText'
|
7 |
+
|
8 |
+
train_ann_file = f'{train_img_root}/SynthText/instances_train.txt'
|
9 |
+
|
10 |
+
train = dict(
|
11 |
+
type='OCRSegDataset',
|
12 |
+
img_prefix=train_img_prefix,
|
13 |
+
ann_file=train_ann_file,
|
14 |
+
loader=dict(
|
15 |
+
type='AnnFileLoader',
|
16 |
+
repeat=1,
|
17 |
+
file_format='txt',
|
18 |
+
parser=dict(
|
19 |
+
type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
|
20 |
+
pipeline=None,
|
21 |
+
test_mode=False)
|
22 |
+
|
23 |
+
train_list = [train]
|
configs/_base_/recog_datasets/academic_test.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Text Recognition Testing set, including:
# Regular Datasets: IIIT5K, SVT, IC13
# Irregular Datasets: IC15, SVTP, CT80

# Root directory that every path below is built from.
test_root = 'data/mixture'

# Image directories, one per benchmark (numbering matches test1..test6).
test_img_prefix1 = f'{test_root}/IIIT5K/'
test_img_prefix2 = f'{test_root}/svt/'
test_img_prefix3 = f'{test_root}/icdar_2013/'
test_img_prefix4 = f'{test_root}/icdar_2015/'
test_img_prefix5 = f'{test_root}/svtp/'
test_img_prefix6 = f'{test_root}/ct80/'

# Annotation files, same numbering as the image prefixes above.
test_ann_file1 = f'{test_root}/IIIT5K/test_label.txt'
test_ann_file2 = f'{test_root}/svt/test_label.txt'
test_ann_file3 = f'{test_root}/icdar_2013/test_label_1015.txt'
test_ann_file4 = f'{test_root}/icdar_2015/test_label.txt'
test_ann_file5 = f'{test_root}/svtp/test_label.txt'
test_ann_file6 = f'{test_root}/ct80/test_label.txt'

# Template dataset; test2..test6 below reuse everything except the paths.
test1 = dict(
    type='OCRDataset',
    img_prefix=test_img_prefix1,
    ann_file=test_ann_file1,
    # Annotation loader: a 'txt' file whose lines are split on a single
    # space; field 0 becomes 'filename' and field 1 becomes 'text'.
    loader=dict(
        type='AnnFileLoader',
        repeat=1,
        file_format='txt',
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    # No pipeline is set here; presumably the config that includes this
    # file supplies one -- TODO confirm.
    pipeline=None,
    test_mode=True)

# Each dataset below is a SHALLOW copy of test1 with only the two paths
# replaced, so the nested `loader` dict is the same object in all six.
test2 = dict(test1, img_prefix=test_img_prefix2, ann_file=test_ann_file2)

test3 = dict(test1, img_prefix=test_img_prefix3, ann_file=test_ann_file3)

test4 = dict(test1, img_prefix=test_img_prefix4, ann_file=test_ann_file4)

test5 = dict(test1, img_prefix=test_img_prefix5, ann_file=test_ann_file5)

test6 = dict(test1, img_prefix=test_img_prefix6, ann_file=test_ann_file6)

test_list = [test1, test2, test3, test4, test5, test6]
|
configs/_base_/recog_datasets/seg_toy_data.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Toy character-annotated dataset used for the segmentation recognizer.
# NOTE: `prefix` already ends with '/', so the paths below append to it
# directly; adding another '/' would produce a doubled separator.
prefix = 'tests/data/ocr_char_ann_toy_dataset/'

train = dict(
    type='OCRSegDataset',
    img_prefix=f'{prefix}imgs',
    ann_file=f'{prefix}instances_train.txt',
    # Annotation loader: a 'txt' file, repeated 100 times, whose records are
    # parsed by LineJsonParser into the three keys listed below.
    loader=dict(
        type='AnnFileLoader',
        repeat=100,
        file_format='txt',
        parser=dict(
            type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
    pipeline=None,
    # NOTE(review): the training split is declared with test_mode=True;
    # kept as found -- confirm this is intentional.
    test_mode=True)

test = dict(
    type='OCRDataset',
    img_prefix=f'{prefix}imgs',
    ann_file=f'{prefix}instances_test.txt',
    # Annotation loader: a 'txt' file whose lines are split on a single
    # space; field 0 becomes 'filename' and field 1 becomes 'text'.
    loader=dict(
        type='AnnFileLoader',
        repeat=1,
        file_format='txt',
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=None,
    test_mode=True)

train_list = [train]

test_list = [test]
|
configs/_base_/recog_datasets/toy_data.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Toy text-recognition dataset: the same images are exposed through a plain
# text label file (train1) and through an lmdb label file (train2, test).
dataset_type = 'OCRDataset'

root = 'tests/data/ocr_toy_dataset'
img_prefix = f'{root}/imgs'
train_anno_file1 = f'{root}/label.txt'

train1 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file1,
    # 'txt' annotations read from disk, repeated 100 times; lines are split
    # on a single space into 'filename' (field 0) and 'text' (field 1).
    loader=dict(
        type='AnnFileLoader',
        repeat=100,
        file_format='txt',
        file_storage_backend='disk',
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=None,
    test_mode=False)

train_anno_file2 = f'{root}/label.lmdb'
train2 = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=train_anno_file2,
    # 'lmdb' annotations read from disk, repeated 100 times; records are
    # parsed by LineJsonParser into 'filename' and 'text'.
    loader=dict(
        type='AnnFileLoader',
        repeat=100,
        file_format='lmdb',
        file_storage_backend='disk',
        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
    pipeline=None,
    test_mode=False)

# Same lmdb file as train_anno_file2.
test_anno_file1 = f'{root}/label.lmdb'
test = dict(
    type=dataset_type,
    img_prefix=img_prefix,
    ann_file=test_anno_file1,
    # Same loader settings as train2 except that the data is not repeated.
    loader=dict(
        type='AnnFileLoader',
        repeat=1,
        file_format='lmdb',
        file_storage_backend='disk',
        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
    pipeline=None,
    test_mode=True)

train_list = [train1, train2]

test_list = [test]
|
configs/_base_/recog_models/abinet.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# num_chars depends on the configuration of label_convertor. The actual
# dictionary size is 36 + 1 (<BOS/EOS>).
# TODO: Automatically update num_chars based on the configuration of
# label_convertor
num_chars = 37
max_seq_len = 26

label_convertor = dict(
    type='ABIConvertor',
    dict_type='DICT36',
    with_unknown=False,
    with_padding=False,
    lower=True,
)

# ABINet recognizer: a vision model (`encoder`), a language decoder
# (`decoder`) and a fuser, all sharing num_chars / max_seq_len from above.
model = dict(
    type='ABINet',
    backbone=dict(type='ResNetABI'),
    encoder=dict(
        type='ABIVisionModel',
        encoder=dict(
            type='TransformerEncoder',
            n_layers=3,
            n_head=8,
            d_model=512,
            d_inner=2048,
            dropout=0.1,
            # 8 * 32 equals attn_height * attn_width of the vision decoder
            # below; presumably deliberate -- TODO confirm.
            max_len=8 * 32,
        ),
        decoder=dict(
            type='ABIVisionDecoder',
            in_channels=512,
            num_channels=64,
            attn_height=8,
            attn_width=32,
            attn_mode='nearest',
            use_result='feature',
            num_chars=num_chars,
            max_seq_len=max_seq_len,
            init_cfg=dict(type='Xavier', layer='Conv2d')),
    ),
    decoder=dict(
        type='ABILanguageDecoder',
        d_model=512,
        n_head=8,
        d_inner=2048,
        n_layers=4,
        dropout=0.1,
        detach_tokens=True,
        use_self_attn=False,
        # The padding index is the last index of the dictionary.
        pad_idx=num_chars - 1,
        num_chars=num_chars,
        max_seq_len=max_seq_len,
        init_cfg=None),
    fuser=dict(
        type='ABIFuser',
        d_model=512,
        num_chars=num_chars,
        init_cfg=None,
        max_seq_len=max_seq_len,
    ),
    # The three loss terms (encoder, decoder, fusion) are weighted equally.
    loss=dict(
        type='ABILoss',
        enc_weight=1.0,
        dec_weight=1.0,
        fusion_weight=1.0,
        num_classes=num_chars),
    label_convertor=label_convertor,
    max_seq_len=max_seq_len,
    iter_size=3)
|
configs/_base_/recog_models/crnn.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CTC label convertor over the 'DICT36' dictionary; text is lower-cased and
# no unknown token is used.
label_convertor = dict(
    type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)

# CRNN recognizer without preprocessor or encoder: backbone -> decoder.
model = dict(
    type='CRNNNet',
    preprocessor=None,
    # input_channels=1: the backbone is configured for single-channel input.
    backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
    encoder=None,
    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
    loss=dict(type='CTCLoss'),
    label_convertor=label_convertor,
    pretrained=None)
|
configs/_base_/recog_models/crnn_tps.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# model
# CTC label convertor over the 'DICT36' dictionary; text is lower-cased and
# no unknown token is used.
label_convertor = dict(
    type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)

# Same CRNN layout as a plain CRNNNet config, but with a TPS preprocessor
# placed in front of the backbone.
model = dict(
    type='CRNNNet',
    preprocessor=dict(
        type='TPSPreprocessor',
        num_fiducial=20,
        # Input and rectified sizes are identical: (32, 100).
        img_size=(32, 100),
        rectified_img_size=(32, 100),
        num_img_channel=1),
    backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
    encoder=None,
    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
    loss=dict(type='CTCLoss'),
    label_convertor=label_convertor,
    pretrained=None)
|
configs/_base_/recog_models/master.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Attention-style label convertor over the 'DICT90' dictionary, with an
# unknown token enabled.
label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

# MASTER recognizer: ResNet backbone with plugins, no encoder, and a
# MasterDecoder.
model = dict(
    type='MASTER',
    backbone=dict(
        type='ResNet',
        in_channels=3,
        stem_channels=[64, 128],
        # Block-level plugin: a GCAModule attached at 'after_conv2'.
        block_cfgs=dict(
            type='BasicBlock',
            plugins=dict(
                cfg=dict(
                    type='GCAModule',
                    ratio=0.0625,
                    n_head=1,
                    pooling_type='att',
                    is_att_scale=False,
                    fusion_type='channel_add'),
                position='after_conv2')),
        arch_layers=[1, 2, 5, 3],
        arch_channels=[256, 256, 512, 512],
        strides=[1, 1, 1, 1],
        # Stage-level plugins; each `stages` tuple has one flag per stage
        # (four stages) selecting where the plugin is applied.
        plugins=[
            # 2x2 max-pool before stages 1 and 2.
            dict(
                cfg=dict(type='Maxpool2d', kernel_size=2, stride=(2, 2)),
                stages=(True, True, False, False),
                position='before_stage'),
            # (2, 1) max-pool before stage 3 only.
            dict(
                cfg=dict(type='Maxpool2d', kernel_size=(2, 1), stride=(2, 1)),
                stages=(False, False, True, False),
                position='before_stage'),
            # 3x3 conv + BN + ReLU after every stage.
            dict(
                cfg=dict(
                    type='ConvModule',
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    norm_cfg=dict(type='BN'),
                    act_cfg=dict(type='ReLU')),
                stages=(True, True, True, True),
                position='after_stage')
        ],
        init_cfg=[
            dict(type='Kaiming', layer='Conv2d'),
            dict(type='Constant', val=1, layer='BatchNorm2d'),
        ]),
    encoder=None,
    decoder=dict(
        type='MasterDecoder',
        d_model=512,
        n_head=8,
        attn_drop=0.,
        ffn_drop=0.,
        d_inner=2048,
        n_layers=3,
        feat_pe_drop=0.2,
        # Written as a product; presumably feature-map height * width --
        # TODO confirm.
        feat_size=6 * 40),
    loss=dict(type='TFLoss', reduction='mean'),
    label_convertor=label_convertor,
    max_seq_len=30)
|
configs/_base_/recog_models/nrtr_modality_transform.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Attention-style label convertor over the 'DICT36' dictionary; text is
# lower-cased and an unknown token is enabled.
label_convertor = dict(
    type='AttnConvertor', dict_type='DICT36', with_unknown=True, lower=True)

# NRTR recognizer using the NRTRModalityTransform backbone and a 12-layer
# encoder; every other component setting is left to the component itself.
model = dict(
    type='NRTR',
    backbone=dict(type='NRTRModalityTransform'),
    encoder=dict(type='NRTREncoder', n_layers=12),
    decoder=dict(type='NRTRDecoder'),
    loss=dict(type='TFLoss'),
    label_convertor=label_convertor,
    max_seq_len=40)
|
configs/_base_/recog_models/robust_scanner.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Attention-style label convertor over the 'DICT90' dictionary, with an
# unknown token enabled.
label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

# The two sub-decoders handed to RobustScannerDecoder below.
hybrid_decoder = dict(type='SequenceAttentionDecoder')

position_decoder = dict(type='PositionAttentionDecoder')

model = dict(
    type='RobustScanner',
    backbone=dict(type='ResNet31OCR'),
    # Reduces 512 channels to 128; these match dim_input / dim_model of the
    # decoder below.
    encoder=dict(
        type='ChannelReductionEncoder',
        in_channels=512,
        out_channels=128,
    ),
    decoder=dict(
        type='RobustScannerDecoder',
        dim_input=512,
        dim_model=128,
        hybrid_decoder=hybrid_decoder,
        position_decoder=position_decoder),
    loss=dict(type='SARLoss'),
    label_convertor=label_convertor,
    max_seq_len=30)
|
configs/_base_/recog_models/sar.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Attention-style label convertor over the 'DICT90' dictionary, with an
# unknown token enabled.
label_convertor = dict(
    type='AttnConvertor', dict_type='DICT90', with_unknown=True)

# SAR recognizer: ResNet31OCR backbone, SAREncoder and ParallelSARDecoder.
model = dict(
    type='SARNet',
    backbone=dict(type='ResNet31OCR'),
    encoder=dict(
        type='SAREncoder',
        enc_bi_rnn=False,
        enc_do_rnn=0.1,
        enc_gru=False,
    ),
    decoder=dict(
        type='ParallelSARDecoder',
        # Same value as the encoder's own enc_bi_rnn setting above.
        enc_bi_rnn=False,
        dec_bi_rnn=False,
        dec_do_rnn=0,
        dec_gru=False,
        pred_dropout=0.1,
        d_k=512,
        pred_concat=True),
    loss=dict(type='SARLoss'),
    label_convertor=label_convertor,
    max_seq_len=30)
|
configs/_base_/recog_models/satrn.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Attention-style label convertor over the 'DICT36' dictionary; text is
# lower-cased and an unknown token is enabled.
label_convertor = dict(
    type='AttnConvertor', dict_type='DICT36', with_unknown=True, lower=True)

# SATRN recognizer; no component options are set here beyond each `type`.
model = dict(
    type='SATRN',
    backbone=dict(type='ShallowCNN'),
    encoder=dict(type='SatrnEncoder'),
    decoder=dict(type='TFDecoder'),
    loss=dict(type='TFLoss'),
    label_convertor=label_convertor,
    max_seq_len=40)
|
configs/_base_/recog_models/seg.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Segmentation label convertor over the 'DICT36' dictionary; text is
# lower-cased and an unknown token is enabled.
label_convertor = dict(
    type='SegConvertor', dict_type='DICT36', with_unknown=True, lower=True)

# Segmentation-based recognizer: backbone -> neck -> head.
model = dict(
    type='SegRecognizer',
    backbone=dict(
        type='ResNet31OCR',
        layers=[1, 2, 5, 3],
        channels=[32, 64, 128, 256, 512, 512],
        # Four outputs are requested, matching the four in_channels entries
        # of the neck below.
        out_indices=[0, 1, 2, 3],
        stage4_pool_cfg=dict(kernel_size=2, stride=2),
        last_stage_pool=True),
    neck=dict(
        type='FPNOCR', in_channels=[128, 256, 512, 512], out_channels=256),
    # The head's in_channels equals the neck's out_channels (256).
    head=dict(
        type='SegHead',
        in_channels=256,
        upsample_param=dict(scale_factor=2.0, mode='nearest')),
    loss=dict(
        type='SegLoss', seg_downsample_ratio=1.0, seg_with_loss_weight=True),
    label_convertor=label_convertor)
|
configs/_base_/recog_pipelines/abinet_pipeline.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Per-channel normalization statistics (three channels), expanded into the
# NormalizeOCR step of both pipelines below.
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    # Fixed 32 x 128 output: min_width == max_width and the aspect ratio is
    # not kept.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=128,
        max_width=128,
        keep_aspect_ratio=False,
        width_downsample_ratio=0.25),
    # Geometric augmentation, wrapped with p=0.5: one of rotate / affine /
    # perspective.
    dict(
        type='RandomWrapper',
        p=0.5,
        transforms=[
            dict(
                type='OneOfWrapper',
                transforms=[
                    dict(
                        type='RandomRotateTextDet',
                        max_angle=15,
                    ),
                    dict(
                        type='TorchVisionWrapper',
                        op='RandomAffine',
                        degrees=15,
                        translate=(0.3, 0.3),
                        scale=(0.5, 2.),
                        shear=(-45, 45),
                    ),
                    dict(
                        type='TorchVisionWrapper',
                        op='RandomPerspective',
                        distortion_scale=0.5,
                        p=1,
                    ),
                ])
        ],
    ),
    # Quality augmentation, wrapped with p=0.25: pyramid rescale followed by
    # noise / motion blur.
    dict(
        type='RandomWrapper',
        p=0.25,
        transforms=[
            dict(type='PyramidRescale'),
            dict(
                type='Albu',
                transforms=[
                    dict(type='GaussNoise', var_limit=(20, 20), p=0.5),
                    dict(type='MotionBlur', blur_limit=6, p=0.5),
                ]),
        ]),
    # Colour augmentation, wrapped with p=0.25.
    dict(
        type='RandomWrapper',
        p=0.25,
        transforms=[
            dict(
                type='TorchVisionWrapper',
                op='ColorJitter',
                brightness=0.5,
                saturation=0.5,
                contrast=0.5,
                hue=0.1),
        ]),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio',
            'resize_shape'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # The inner transforms are configured for three rotations: 0, 90, 270.
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            # Same resize settings as in train_pipeline.
            dict(
                type='ResizeOCR',
                height=32,
                min_width=128,
                max_width=128,
                keep_aspect_ratio=False,
                width_downsample_ratio=0.25),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            # Unlike training, 'text' is not collected; 'img_norm_cfg' and
            # 'ori_filename' are.
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio',
                    'resize_shape', 'img_norm_cfg', 'ori_filename'
                ]),
        ])
]
|
configs/_base_/recog_pipelines/crnn_pipeline.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Single-channel normalization statistics (images are loaded as grayscale
# below), expanded into the Normalize step of both pipelines.
img_norm_cfg = dict(mean=[127], std=[127])

train_pipeline = [
    dict(type='LoadImageFromFile', color_type='grayscale'),
    # Fixed 32 x 100 output: min_width == max_width and the aspect ratio is
    # not kept.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=100,
        max_width=100,
        keep_aspect_ratio=False),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=['filename', 'resize_shape', 'text', 'valid_ratio']),
]
test_pipeline = [
    dict(type='LoadImageFromFile', color_type='grayscale'),
    # Unlike training, the aspect ratio is kept and no maximum width is set.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=None,
        keep_aspect_ratio=True),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'resize_shape', 'valid_ratio', 'img_norm_cfg',
            'ori_filename', 'img_shape', 'ori_shape'
        ]),
]
|
configs/_base_/recog_pipelines/crnn_tps_pipeline.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Single-channel normalization statistics (images are loaded as grayscale
# below), expanded into the NormalizeOCR step of both pipelines.
img_norm_cfg = dict(mean=[0.5], std=[0.5])

train_pipeline = [
    dict(type='LoadImageFromFile', color_type='grayscale'),
    # Fixed 32 x 100 output: min_width == max_width and the aspect ratio is
    # not kept.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=100,
        max_width=100,
        keep_aspect_ratio=False),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'resize_shape', 'text', 'valid_ratio'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile', color_type='grayscale'),
    # Same as training except min_width is 32 instead of 100.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=100,
        keep_aspect_ratio=False),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'resize_shape', 'valid_ratio',
            'img_norm_cfg', 'ori_filename', 'img_shape'
        ]),
]
|
configs/_base_/recog_pipelines/master_pipeline.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Per-channel normalization statistics (three channels), expanded into the
# NormalizeOCR step of both pipelines below.
img_norm_cfg = dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    # Height 48, width between 48 and 160, aspect ratio kept.
    dict(
        type='ResizeOCR',
        height=48,
        min_width=48,
        max_width=160,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio',
            'resize_shape'
        ]),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # The inner transforms are configured for three rotations: 0, 90, 270.
    dict(
        type='MultiRotateAugOCR',
        rotate_degrees=[0, 90, 270],
        transforms=[
            # Same resize settings as in train_pipeline.
            dict(
                type='ResizeOCR',
                height=48,
                min_width=48,
                max_width=160,
                keep_aspect_ratio=True),
            dict(type='ToTensorOCR'),
            dict(type='NormalizeOCR', **img_norm_cfg),
            # Unlike training, 'text' is not collected; 'img_norm_cfg' and
            # 'ori_filename' are.
            dict(
                type='Collect',
                keys=['img'],
                meta_keys=[
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio',
                    'img_norm_cfg', 'ori_filename', 'resize_shape'
                ]),
        ])
]
|
configs/_base_/recog_pipelines/nrtr_pipeline.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Per-channel normalization statistics (three channels), expanded into the
# NormalizeOCR step of both pipelines below.
img_norm_cfg = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_pipeline = [
    dict(type='LoadImageFromFile'),
    # Height 32, width between 32 and 160, aspect ratio kept.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=160,
        keep_aspect_ratio=True,
        width_downsample_ratio=0.25),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'resize_shape', 'text', 'valid_ratio'
        ]),
]

test_pipeline = [
    dict(type='LoadImageFromFile'),
    # NOTE(review): unlike train_pipeline, width_downsample_ratio is not
    # passed here, so ResizeOCR uses whatever its default is -- confirm this
    # asymmetry is intended.
    dict(
        type='ResizeOCR',
        height=32,
        min_width=32,
        max_width=160,
        keep_aspect_ratio=True),
    dict(type='ToTensorOCR'),
    dict(type='NormalizeOCR', **img_norm_cfg),
    dict(
        type='Collect',
        keys=['img'],
        meta_keys=[
            'filename', 'ori_shape', 'resize_shape', 'valid_ratio',
            'img_norm_cfg', 'ori_filename', 'img_shape'
        ])
]
|