Spaces:
Build error
Build error
Duplicate from xin/PatentSolver
Browse filesCo-authored-by: nixin <[email protected]>
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +29 -0
- App/__init__.py +0 -0
- App/__pycache__/__init__.cpython-38.pyc +3 -0
- App/assets/FakeConcept +2 -0
- App/assets/FilterS +3 -0
- App/assets/FilterWords +7 -0
- App/assets/Incrementer +4 -0
- App/assets/OneCommaDiscriminator +11 -0
- App/assets/abbreviation_sentence_splitter +24 -0
- App/assets/claims_indices +1 -0
- App/assets/commonWords +0 -0
- App/assets/dicts/dec.yml +2 -0
- App/assets/dicts/inc.yml +4 -0
- App/assets/dicts/inv.yml +6 -0
- App/assets/dicts/negative.yml +102 -0
- App/assets/dicts/positive.yml +113 -0
- App/assets/dropPart +39 -0
- App/assets/examplificationclues +8 -0
- App/assets/exclude_from_parameters +22 -0
- App/assets/exclusionList +73 -0
- App/assets/getFromClaims +1 -0
- App/assets/includeLinks +7 -0
- App/assets/inclusionList +17 -0
- App/assets/parameter_core +237 -0
- App/assets/problem_markers +151 -0
- App/assets/referencing_indices +4 -0
- App/assets/removeItems +22 -0
- App/assets/stopword_common_English +500 -0
- App/assets/stopword_patent_English +38 -0
- App/assets/stopword_rake.txt +590 -0
- App/assets/stopwords +567 -0
- App/assets/trainingsNegative +216 -0
- App/assets/trainingsPositive +213 -0
- App/assets/wordAfterNumber +95 -0
- App/assets/wordBeforeNumber +20 -0
- App/assets/wordtagVerb +2 -0
- App/bin/ClassifierWithIncr.py +189 -0
- App/bin/ComplexParser.py +45 -0
- App/bin/CorpusProcessor.py +460 -0
- App/bin/FiguresCleaner.py +44 -0
- App/bin/FindTechnologies.py +64 -0
- App/bin/InformationExtractor.py +588 -0
- App/bin/InformationExtractor_Claims.py +165 -0
- App/bin/InputHandler.py +35 -0
- App/bin/MagicParser.py +45 -0
- App/bin/PGProcessor.py +107 -0
- App/bin/ParamProcessor.py +99 -0
- App/bin/ParameterExtractor.py +51 -0
- App/bin/PatentHandler.py +254 -0
- App/bin/SentenceClassifier.py +60 -0
.gitattributes
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.pyc filter=lfs diff=lfs merge=lfs -text
|
App/__init__.py
ADDED
File without changes
|
App/__pycache__/__init__.cpython-38.pyc
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ecebdb48cd6c7c6738e1a46ec4a92320a1e39163fdf8b688b16d9101dc77606
|
3 |
+
size 161
|
App/assets/FakeConcept
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
such as
|
2 |
+
as such
|
App/assets/FilterS
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
figure
|
2 |
+
figures
|
3 |
+
previous
|
App/assets/FilterWords
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
if
|
2 |
+
claim 1
|
3 |
+
claim 2
|
4 |
+
claim 3
|
5 |
+
however
|
6 |
+
but
|
7 |
+
cause
|
App/assets/Incrementer
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
approximately
|
2 |
+
suitable
|
3 |
+
good
|
4 |
+
preferably
|
App/assets/OneCommaDiscriminator
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
preferably,\s
|
2 |
+
in addition
|
3 |
+
which
|
4 |
+
thus
|
5 |
+
preferably
|
6 |
+
but
|
7 |
+
generally
|
8 |
+
conventional
|
9 |
+
in particular
|
10 |
+
specifically
|
11 |
+
as necessary
|
App/assets/abbreviation_sentence_splitter
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
vol
|
2 |
+
no
|
3 |
+
pp
|
4 |
+
p
|
5 |
+
ch
|
6 |
+
pat
|
7 |
+
i.e
|
8 |
+
fig
|
9 |
+
vdd
|
10 |
+
p
|
11 |
+
sec
|
12 |
+
centigrade
|
13 |
+
vols
|
14 |
+
figs
|
15 |
+
approx
|
16 |
+
e.g
|
17 |
+
etc
|
18 |
+
cf
|
19 |
+
ser
|
20 |
+
\]
|
21 |
+
deg
|
22 |
+
ver
|
23 |
+
sup
|
24 |
+
A
|
App/assets/claims_indices
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
^.*\bclaim [1-3]\b\s?,?(.*$)
|
App/assets/commonWords
ADDED
The diff for this file is too large to render.
See raw diff
|
|
App/assets/dicts/dec.yml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
barely: [dec]
|
2 |
+
little: [dec]
|
App/assets/dicts/inc.yml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
too: [inc]
|
2 |
+
very: [inc]
|
3 |
+
sorely: [inc]
|
4 |
+
is: [inc]
|
App/assets/dicts/inv.yml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
increase: [inv]
|
2 |
+
decrease: [inv]
|
3 |
+
increases: [inv]
|
4 |
+
decreases: [inv]
|
5 |
+
increased: [inv]
|
6 |
+
decreased: [inv]
|
App/assets/dicts/negative.yml
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bad: [negative]
|
2 |
+
uninspired: [negative]
|
3 |
+
expensive: [negative]
|
4 |
+
dissapointed: [negative]
|
5 |
+
recommend others to avoid: [negative]
|
6 |
+
do not: [negative]
|
7 |
+
costly: [negative]
|
8 |
+
drawbacks: [negative]
|
9 |
+
drawbacks: [negative]
|
10 |
+
problem: [negative]
|
11 |
+
uneven: [negative]
|
12 |
+
perturbation: [negative]
|
13 |
+
may not: [negative]
|
14 |
+
is not: [negative]
|
15 |
+
are not: [negative]
|
16 |
+
fragil: [negative]
|
17 |
+
damage: [negative]
|
18 |
+
prevent: [negative]
|
19 |
+
can vary: [negative]
|
20 |
+
can not: [negative]
|
21 |
+
cannot: [negative]
|
22 |
+
due to: [negative]
|
23 |
+
thus no longer: [negative]
|
24 |
+
no longer: [negative]
|
25 |
+
annoying: [negative]
|
26 |
+
annoy: [negative]
|
27 |
+
annoys: [negative]
|
28 |
+
annoyed: [negative]
|
29 |
+
saturated: [negative]
|
30 |
+
undesirable: [negative]
|
31 |
+
pollute: [negative]
|
32 |
+
pollutes: [negative]
|
33 |
+
saturates: [negative]
|
34 |
+
saturating: [negative]
|
35 |
+
saturate: [negative]
|
36 |
+
blocked: [negative]
|
37 |
+
dirt: [negative]
|
38 |
+
dirts: [negative]
|
39 |
+
complexity: [negative]
|
40 |
+
tiresome: [negative]
|
41 |
+
does not: [negative]
|
42 |
+
abnormally: [negative]
|
43 |
+
abruptly: [negative]
|
44 |
+
critically: [negative]
|
45 |
+
dangerously: [negative]
|
46 |
+
disadvantages: [negative]
|
47 |
+
disadvantage: [negative]
|
48 |
+
dramatically: [negative]
|
49 |
+
erroneously: [negative]
|
50 |
+
exceedingly: [negative]
|
51 |
+
excessively: [negative]
|
52 |
+
hardly: [negative]
|
53 |
+
heavily: [negative]
|
54 |
+
irreversibly: [negative]
|
55 |
+
poorly: [negative]
|
56 |
+
randomly: [negative]
|
57 |
+
severely: [negative]
|
58 |
+
unacceptably: [negative]
|
59 |
+
unconditionally: [negative]
|
60 |
+
unevenly: [negative]
|
61 |
+
unexpectedly: [negative]
|
62 |
+
unfortunately: [negative]
|
63 |
+
unusually: [negative]
|
64 |
+
uselessly: [negative]
|
65 |
+
badly: [negative]
|
66 |
+
impair: [negative]
|
67 |
+
increasing: [negative]
|
68 |
+
weight: [negative]
|
69 |
+
costs: [negative]
|
70 |
+
cost: [negative]
|
71 |
+
is not: [negative]
|
72 |
+
damage: [negative]
|
73 |
+
complicate: [negative]
|
74 |
+
destroy: [negative]
|
75 |
+
was necessary: [negative]
|
76 |
+
annoy: [negative]
|
77 |
+
annoys: [negative]
|
78 |
+
annoyed: [negative]
|
79 |
+
annoying: [negative]
|
80 |
+
noise: [negative]
|
81 |
+
inevitably: [negative]
|
82 |
+
be annoying: [negative]
|
83 |
+
is annoying: [negative]
|
84 |
+
difficulties: [negative]
|
85 |
+
difficulty: [negative]
|
86 |
+
difficult: [negative]
|
87 |
+
fatty: [negative]
|
88 |
+
prevent: [negative]
|
89 |
+
leakage: [negative]
|
90 |
+
corrosive: [negative]
|
91 |
+
require: [negative]
|
92 |
+
deformation: [negative]
|
93 |
+
necessity: [negative]
|
94 |
+
error: [negative]
|
95 |
+
errors: [negative]
|
96 |
+
occur: [negative]
|
97 |
+
occurs: [negative]
|
98 |
+
may nevertheless: [negative]
|
99 |
+
may not: [negative]
|
100 |
+
continue to increase: [negative]
|
101 |
+
decrease: [negative]
|
102 |
+
|
App/assets/dicts/positive.yml
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nice: [positive]
|
2 |
+
awesome: [positive]
|
3 |
+
cool: [positive]
|
4 |
+
superb: [positive]
|
5 |
+
comprises: [positive]
|
6 |
+
is: [positive]
|
7 |
+
advantageously: [positive]
|
8 |
+
advantageous: [positive]
|
9 |
+
preferably: [positive]
|
10 |
+
advantage: [positive]
|
11 |
+
advantages: [positive]
|
12 |
+
avoid: [positive]
|
13 |
+
useful: [positve]
|
14 |
+
good: [positive]
|
15 |
+
suitable: [positive]
|
16 |
+
be: [positive]
|
17 |
+
improve: [positive]
|
18 |
+
improved: [positive]
|
19 |
+
improves: [positive]
|
20 |
+
does not have to: [positive]
|
21 |
+
removed: [positive]
|
22 |
+
acceptably: [positive]
|
23 |
+
accurately: [positive]
|
24 |
+
acutely: [positive]
|
25 |
+
adequately: [positive]
|
26 |
+
advantageously: [positive]
|
27 |
+
appreciably: [positive]
|
28 |
+
beneficially: [positive]
|
29 |
+
brightly: [positive]
|
30 |
+
cheaply: [positive]
|
31 |
+
clearly: [positive]
|
32 |
+
conveniently: [positive]
|
33 |
+
correctly: [positive]
|
34 |
+
cost-effectively: [positive]
|
35 |
+
desirably: [positive]
|
36 |
+
effectively: [positive]
|
37 |
+
effectually: [positive]
|
38 |
+
efficaciously: [positive]
|
39 |
+
efficiently: [positive]
|
40 |
+
elaborately: [positive]
|
41 |
+
favourably: [positive]
|
42 |
+
finely: [positive]
|
43 |
+
greatly: [positive]
|
44 |
+
inexpensively: [positive]
|
45 |
+
interestingly: [positive]
|
46 |
+
measurably: [positive]
|
47 |
+
necessarily: [positive]
|
48 |
+
determines: [positive]
|
49 |
+
determine: [positive]
|
50 |
+
optimally: [positive]
|
51 |
+
potentially: [positive]
|
52 |
+
precisely: [positive]
|
53 |
+
properly: [positive]
|
54 |
+
purposefully: [positive]
|
55 |
+
qualitatively: [positive]
|
56 |
+
quantitatively: [positive]
|
57 |
+
quite: [positive]
|
58 |
+
rapidly: [positive]
|
59 |
+
safely: [positive]
|
60 |
+
satisfactorily: [positive]
|
61 |
+
securely: [positive]
|
62 |
+
sharply: [positive]
|
63 |
+
uniform: [positive]
|
64 |
+
distribution: [positive]
|
65 |
+
reduce: [positive]
|
66 |
+
reduced: [positive]
|
67 |
+
reduces: [positive]
|
68 |
+
desire: [positive]
|
69 |
+
desired: [positive]
|
70 |
+
desires: [positive]
|
71 |
+
have: [positive]
|
72 |
+
has: [positive]
|
73 |
+
be: [positive]
|
74 |
+
about: [positive]
|
75 |
+
drive: [positive]
|
76 |
+
drives: [positive]
|
77 |
+
generally: [positive]
|
78 |
+
accomplish: [positive]
|
79 |
+
accomplished: [positive]
|
80 |
+
consists: [positive]
|
81 |
+
consist: [positive]
|
82 |
+
include: [positive]
|
83 |
+
includes: [positive]
|
84 |
+
remove: [positive]
|
85 |
+
removes: [positive]
|
86 |
+
present: [positive]
|
87 |
+
presents: [positive]
|
88 |
+
separates: [positive]
|
89 |
+
connected: [positive]
|
90 |
+
enclosed: [positive]
|
91 |
+
with respect: [positive]
|
92 |
+
disconnected: [positive]
|
93 |
+
relevance: [positive]
|
94 |
+
extend: [positive]
|
95 |
+
enclose: [positive]
|
96 |
+
create: [positive]
|
97 |
+
creates: [positive]
|
98 |
+
so that: [positive]
|
99 |
+
well known: [positive]
|
100 |
+
contain: [positive]
|
101 |
+
contains: [positive]
|
102 |
+
receive: [positive]
|
103 |
+
receives: [positive]
|
104 |
+
processes: [positive]
|
105 |
+
processe: [positive]
|
106 |
+
optimal: [positive]
|
107 |
+
inexpensive: [positive]
|
108 |
+
operates: [positive]
|
109 |
+
operate: [positive]
|
110 |
+
minimize: [positive]
|
111 |
+
can not therefore invade: [positive]
|
112 |
+
installed: [positive]
|
113 |
+
install: [positive]
|
App/assets/dropPart
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
DETAILED DESCRIPTION OF THE INVENTION
|
2 |
+
BRIEF DESCRIPTION OF DRAWINGS
|
3 |
+
BACKGROUND OF INVENTION
|
4 |
+
BACKGROUND OF THE INVENTION
|
5 |
+
FIELD OF INVENTION
|
6 |
+
BRIEF DESCRIPTION OF THE DRAWINGS
|
7 |
+
Brief Description of the Drawings
|
8 |
+
Brief Description of the Prior Art
|
9 |
+
DETAILED DESCRIPTION OF THE PREFERRED EMBODIMENTS
|
10 |
+
Object of the Invention
|
11 |
+
DETALIED DESCRIPTION
|
12 |
+
Field of the Invention
|
13 |
+
Background Art
|
14 |
+
Background art
|
15 |
+
Object of the Invention
|
16 |
+
DESCRIPTION
|
17 |
+
BACKGROUND
|
18 |
+
DRAWINGS
|
19 |
+
SUMMARY
|
20 |
+
Brief Description of the Prior Art
|
21 |
+
detailed description of the invention and preferred embodiments
|
22 |
+
detailed description of the preferred embodiments
|
23 |
+
detailed description of the preferred embodiment
|
24 |
+
detailed description
|
25 |
+
background of the invention
|
26 |
+
Background of the Invention
|
27 |
+
detailed description of the invention
|
28 |
+
summary of the inventions
|
29 |
+
summary of the invention
|
30 |
+
background and summary of the invention
|
31 |
+
field of the invention
|
32 |
+
description of the related art
|
33 |
+
related background art
|
34 |
+
description of the prior art
|
35 |
+
field of the invention
|
36 |
+
brief description of the invention
|
37 |
+
description of the invention
|
38 |
+
description of the preferred embodiments.
|
39 |
+
what is claimed is
|
App/assets/examplificationclues
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
moreover,\s
|
2 |
+
in addition,\s
|
3 |
+
in pratice,\s
|
4 |
+
lastly,\s
|
5 |
+
such a\s
|
6 |
+
such\s
|
7 |
+
typically,\s
|
8 |
+
in this example,\s
|
App/assets/exclude_from_parameters
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
invention
|
2 |
+
comprising
|
3 |
+
that
|
4 |
+
initiation
|
5 |
+
between
|
6 |
+
investigation
|
7 |
+
investigations
|
8 |
+
example
|
9 |
+
illustration
|
10 |
+
examples
|
11 |
+
illustrations
|
12 |
+
thereof
|
13 |
+
increase
|
14 |
+
decrease
|
15 |
+
under
|
16 |
+
problem
|
17 |
+
as
|
18 |
+
parison
|
19 |
+
regardless
|
20 |
+
by
|
21 |
+
through
|
22 |
+
another
|
App/assets/exclusionList
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
figure
|
2 |
+
figures
|
3 |
+
shows
|
4 |
+
;
|
5 |
+
wherein
|
6 |
+
the present invention
|
7 |
+
list of reference characters
|
8 |
+
brief description of the drawings
|
9 |
+
detailed description of preferred embodiments
|
10 |
+
description of a preferred embodiment
|
11 |
+
:
|
12 |
+
for example
|
13 |
+
this
|
14 |
+
drawing
|
15 |
+
drawings
|
16 |
+
embodiments
|
17 |
+
embodiment
|
18 |
+
arrangement
|
19 |
+
arrangements
|
20 |
+
invention
|
21 |
+
inventions
|
22 |
+
provisional application
|
23 |
+
provisional patent application
|
24 |
+
method
|
25 |
+
present application
|
26 |
+
international application
|
27 |
+
reference characters
|
28 |
+
description
|
29 |
+
descriptions
|
30 |
+
in addition
|
31 |
+
likewise
|
32 |
+
figure
|
33 |
+
Figure
|
34 |
+
Figures
|
35 |
+
figures
|
36 |
+
comprise
|
37 |
+
comprises
|
38 |
+
comprising
|
39 |
+
Another purpose
|
40 |
+
related patent
|
41 |
+
DETAILED
|
42 |
+
OVERVIEW
|
43 |
+
summary
|
44 |
+
present disclosure
|
45 |
+
cross-references
|
46 |
+
cross-reference
|
47 |
+
disclosed herein
|
48 |
+
aforesaid document
|
49 |
+
describes
|
50 |
+
describe
|
51 |
+
relates
|
52 |
+
relate
|
53 |
+
present inventive
|
54 |
+
Description of the Related Art
|
55 |
+
publication
|
56 |
+
Publication
|
57 |
+
referred
|
58 |
+
we think
|
59 |
+
thereafter
|
60 |
+
Apparatus
|
61 |
+
apparatus
|
62 |
+
what is claimed is
|
63 |
+
What is claimed is
|
64 |
+
herein
|
65 |
+
Herein
|
66 |
+
will not be presented more in details
|
67 |
+
discloses
|
68 |
+
previous
|
69 |
+
this
|
70 |
+
This
|
71 |
+
table
|
72 |
+
Table
|
73 |
+
TABLE
|
App/assets/getFromClaims
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
wherein
|
App/assets/includeLinks
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
However
|
2 |
+
however
|
3 |
+
If
|
4 |
+
Because
|
5 |
+
When\b
|
6 |
+
Since
|
7 |
+
since
|
App/assets/inclusionList
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
however
|
2 |
+
may
|
3 |
+
if
|
4 |
+
are
|
5 |
+
advantageously
|
6 |
+
may be
|
7 |
+
may cause
|
8 |
+
cause
|
9 |
+
causes
|
10 |
+
is
|
11 |
+
when
|
12 |
+
since
|
13 |
+
in order to
|
14 |
+
in practice
|
15 |
+
many
|
16 |
+
Many
|
17 |
+
problem
|
App/assets/parameter_core
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ability
|
2 |
+
absorption
|
3 |
+
accent
|
4 |
+
accuracy
|
5 |
+
adaptability
|
6 |
+
air pressure
|
7 |
+
albedo
|
8 |
+
amount
|
9 |
+
anatomy
|
10 |
+
area
|
11 |
+
arena
|
12 |
+
atmospheric pressure
|
13 |
+
automation
|
14 |
+
bar
|
15 |
+
baron
|
16 |
+
beat
|
17 |
+
bod
|
18 |
+
boiling point
|
19 |
+
book
|
20 |
+
breadth
|
21 |
+
brittleness
|
22 |
+
broker
|
23 |
+
build
|
24 |
+
bulk
|
25 |
+
cadence
|
26 |
+
capacitance
|
27 |
+
catching
|
28 |
+
charge
|
29 |
+
chassis
|
30 |
+
cistron
|
31 |
+
color
|
32 |
+
comfort
|
33 |
+
commission
|
34 |
+
complexity
|
35 |
+
complexness
|
36 |
+
concentration
|
37 |
+
condition
|
38 |
+
configuration
|
39 |
+
conformation
|
40 |
+
constancy
|
41 |
+
constituent
|
42 |
+
content
|
43 |
+
continuance
|
44 |
+
contour
|
45 |
+
density
|
46 |
+
departure
|
47 |
+
dependability
|
48 |
+
dependableness
|
49 |
+
deprivation
|
50 |
+
dielectric constant
|
51 |
+
difficultness
|
52 |
+
difficulty
|
53 |
+
direction
|
54 |
+
distance
|
55 |
+
distribution
|
56 |
+
divisor
|
57 |
+
domain
|
58 |
+
ductility
|
59 |
+
durability
|
60 |
+
duration
|
61 |
+
ease
|
62 |
+
easiness
|
63 |
+
effect
|
64 |
+
effectiveness
|
65 |
+
efficacy
|
66 |
+
electric charge
|
67 |
+
electric field
|
68 |
+
electric potential
|
69 |
+
electrical conductivity
|
70 |
+
electrical impedance
|
71 |
+
electrical resistivity
|
72 |
+
emission
|
73 |
+
emphasis
|
74 |
+
enduringness
|
75 |
+
energy
|
76 |
+
espial
|
77 |
+
essence
|
78 |
+
exercising weight
|
79 |
+
exit
|
80 |
+
expanse
|
81 |
+
expiration
|
82 |
+
exponent
|
83 |
+
extent
|
84 |
+
fabrication
|
85 |
+
factor
|
86 |
+
factors
|
87 |
+
fastness
|
88 |
+
field
|
89 |
+
flesh
|
90 |
+
flexibility
|
91 |
+
flow
|
92 |
+
flow rate
|
93 |
+
flowing
|
94 |
+
fluidity
|
95 |
+
focus
|
96 |
+
force
|
97 |
+
force-out
|
98 |
+
forcefulness
|
99 |
+
form
|
100 |
+
frequency
|
101 |
+
gist
|
102 |
+
guidance
|
103 |
+
hardness
|
104 |
+
height
|
105 |
+
hurrying
|
106 |
+
illuminance
|
107 |
+
illumination
|
108 |
+
index
|
109 |
+
inductance
|
110 |
+
informality
|
111 |
+
ingredient
|
112 |
+
insistence
|
113 |
+
insistency
|
114 |
+
instruction
|
115 |
+
intensity
|
116 |
+
intensiveness
|
117 |
+
intrinsic impedance
|
118 |
+
inwardness
|
119 |
+
irradiance
|
120 |
+
issue
|
121 |
+
kernel
|
122 |
+
king
|
123 |
+
length
|
124 |
+
level
|
125 |
+
light
|
126 |
+
location
|
127 |
+
loss
|
128 |
+
loudness
|
129 |
+
luminance
|
130 |
+
luster
|
131 |
+
magnate
|
132 |
+
magnetic field
|
133 |
+
magnetic flux
|
134 |
+
malleability
|
135 |
+
management
|
136 |
+
manufacture
|
137 |
+
manufacturing
|
138 |
+
marrow
|
139 |
+
mass
|
140 |
+
material body
|
141 |
+
meaning
|
142 |
+
measure
|
143 |
+
measurement
|
144 |
+
measuring
|
145 |
+
melting point
|
146 |
+
menses
|
147 |
+
menstruation
|
148 |
+
mensuration
|
149 |
+
meter
|
150 |
+
metre
|
151 |
+
mightiness
|
152 |
+
moment
|
153 |
+
momentum
|
154 |
+
muscularity
|
155 |
+
number
|
156 |
+
office
|
157 |
+
orbit
|
158 |
+
passing
|
159 |
+
pattern
|
160 |
+
period
|
161 |
+
permeability
|
162 |
+
permittivity
|
163 |
+
physical body
|
164 |
+
physique
|
165 |
+
pith
|
166 |
+
point
|
167 |
+
posture
|
168 |
+
potency
|
169 |
+
power
|
170 |
+
powerfulness
|
171 |
+
preciseness
|
172 |
+
precision
|
173 |
+
pressure
|
174 |
+
productiveness
|
175 |
+
productivity
|
176 |
+
push
|
177 |
+
quantity
|
178 |
+
radiance
|
179 |
+
rate
|
180 |
+
rate of flow
|
181 |
+
ratio
|
182 |
+
reflectivity
|
183 |
+
region
|
184 |
+
release
|
185 |
+
reliability
|
186 |
+
reliableness
|
187 |
+
resistivity
|
188 |
+
saturation
|
189 |
+
shape
|
190 |
+
simpleness
|
191 |
+
simplicity
|
192 |
+
solubility
|
193 |
+
speciality
|
194 |
+
specialty
|
195 |
+
specific heat
|
196 |
+
speed
|
197 |
+
speeding
|
198 |
+
sphere
|
199 |
+
spin
|
200 |
+
spotting
|
201 |
+
spying
|
202 |
+
stability
|
203 |
+
stableness
|
204 |
+
standard
|
205 |
+
steering
|
206 |
+
step
|
207 |
+
strain
|
208 |
+
stream
|
209 |
+
strength
|
210 |
+
stress
|
211 |
+
substance
|
212 |
+
surface
|
213 |
+
swiftness
|
214 |
+
temperature
|
215 |
+
tenseness
|
216 |
+
tension
|
217 |
+
thermal conductivity
|
218 |
+
touchstone
|
219 |
+
trouble
|
220 |
+
truth
|
221 |
+
tycoon
|
222 |
+
velocity
|
223 |
+
versatility
|
224 |
+
vigor
|
225 |
+
vigour
|
226 |
+
violence
|
227 |
+
viscosity
|
228 |
+
vitality
|
229 |
+
vividness
|
230 |
+
voltage
|
231 |
+
volume
|
232 |
+
wave impedance
|
233 |
+
weight
|
234 |
+
weightiness
|
235 |
+
weighting
|
236 |
+
weights
|
237 |
+
width
|
App/assets/problem_markers
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
would
|
2 |
+
may
|
3 |
+
cause without present invention or method etc
|
4 |
+
conventional
|
5 |
+
problem
|
6 |
+
turbulence
|
7 |
+
polluting
|
8 |
+
aggravate
|
9 |
+
uneven
|
10 |
+
irregular
|
11 |
+
pertubation
|
12 |
+
disturbance
|
13 |
+
dissipation
|
14 |
+
affect
|
15 |
+
noise-inducing
|
16 |
+
dirt
|
17 |
+
undesirable
|
18 |
+
pollute
|
19 |
+
contaminant
|
20 |
+
difficulty
|
21 |
+
difficult
|
22 |
+
rancid
|
23 |
+
hazard
|
24 |
+
time-consuming
|
25 |
+
painstaking
|
26 |
+
byproduct
|
27 |
+
leak
|
28 |
+
leakage
|
29 |
+
blemish
|
30 |
+
blemished
|
31 |
+
blemishes
|
32 |
+
blemishing
|
33 |
+
blemishings
|
34 |
+
break
|
35 |
+
breaking
|
36 |
+
breakings
|
37 |
+
breaks
|
38 |
+
broken
|
39 |
+
bug
|
40 |
+
bugs
|
41 |
+
cause
|
42 |
+
caused
|
43 |
+
causes
|
44 |
+
causing
|
45 |
+
complicate
|
46 |
+
complicated
|
47 |
+
complicates
|
48 |
+
complicating
|
49 |
+
complication
|
50 |
+
crack
|
51 |
+
cracked
|
52 |
+
cracking
|
53 |
+
crackings
|
54 |
+
critical
|
55 |
+
damage
|
56 |
+
damaged
|
57 |
+
damages
|
58 |
+
damaging
|
59 |
+
defect
|
60 |
+
defected
|
61 |
+
defecting
|
62 |
+
defection
|
63 |
+
defections
|
64 |
+
defects
|
65 |
+
deficiencies
|
66 |
+
deficiency
|
67 |
+
deform
|
68 |
+
deformed
|
69 |
+
deformities
|
70 |
+
deformity
|
71 |
+
degradation
|
72 |
+
degrade
|
73 |
+
degraded
|
74 |
+
degrades
|
75 |
+
deprivation
|
76 |
+
deprive
|
77 |
+
deprived
|
78 |
+
deprives
|
79 |
+
depriving
|
80 |
+
destroy
|
81 |
+
destroyed
|
82 |
+
destroying
|
83 |
+
destroys
|
84 |
+
destruction
|
85 |
+
deteriorate
|
86 |
+
deteriorates
|
87 |
+
deteriorating
|
88 |
+
deteriored
|
89 |
+
detriment
|
90 |
+
difficulties
|
91 |
+
difficulty
|
92 |
+
difficult
|
93 |
+
disadvantage
|
94 |
+
disparates
|
95 |
+
drawbacks
|
96 |
+
grave
|
97 |
+
hard
|
98 |
+
hamper
|
99 |
+
hampered
|
100 |
+
hampering
|
101 |
+
hampers
|
102 |
+
harm
|
103 |
+
harmed
|
104 |
+
harming
|
105 |
+
harms
|
106 |
+
hinder
|
107 |
+
impair
|
108 |
+
impaired
|
109 |
+
impairing
|
110 |
+
impairs
|
111 |
+
imperfection
|
112 |
+
imperfections
|
113 |
+
incident
|
114 |
+
instabilities
|
115 |
+
instability
|
116 |
+
mar
|
117 |
+
marring
|
118 |
+
prejudice
|
119 |
+
problem
|
120 |
+
problems
|
121 |
+
serious
|
122 |
+
severe
|
123 |
+
smashes
|
124 |
+
smashing
|
125 |
+
spoil
|
126 |
+
spoiling
|
127 |
+
stain
|
128 |
+
stains
|
129 |
+
trouble
|
130 |
+
troubled
|
131 |
+
troubles
|
132 |
+
weaken
|
133 |
+
failure
|
134 |
+
unintended
|
135 |
+
limitation
|
136 |
+
limitations
|
137 |
+
drawback
|
138 |
+
weakened
|
139 |
+
degrading
|
140 |
+
undesired
|
141 |
+
weakening
|
142 |
+
disadvantageous
|
143 |
+
weakness
|
144 |
+
weaknesses
|
145 |
+
worse
|
146 |
+
worseing
|
147 |
+
worsen
|
148 |
+
worsened
|
149 |
+
worsens
|
150 |
+
|
151 |
+
|
App/assets/referencing_indices
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
explained before
|
2 |
+
but as
|
3 |
+
most of
|
4 |
+
^if
|
App/assets/removeItems
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
brief description of the drawings
|
2 |
+
detailed description of preferred embodiments
|
3 |
+
description of a preferred embodiment
|
4 |
+
detailed description of the invention and preferred embodiments
|
5 |
+
detailed description of the preferred embodiments
|
6 |
+
detailed description of the preferred embodiment
|
7 |
+
detailed description
|
8 |
+
background of the invention
|
9 |
+
detailed description of the invention
|
10 |
+
summary of the inventions
|
11 |
+
summary of the invention
|
12 |
+
background and summary of the invention
|
13 |
+
field of the invention
|
14 |
+
description of the related art
|
15 |
+
related background art
|
16 |
+
description of the prior art
|
17 |
+
field of the invention
|
18 |
+
brief description of the invention
|
19 |
+
description of the invention
|
20 |
+
description of the preferred embodiments.
|
21 |
+
summary
|
22 |
+
mitsubishi heavy ind ltd [jp]
|
App/assets/stopword_common_English
ADDED
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
a
|
2 |
+
able
|
3 |
+
about
|
4 |
+
above
|
5 |
+
abst
|
6 |
+
accordance
|
7 |
+
according
|
8 |
+
accordingly
|
9 |
+
across
|
10 |
+
act
|
11 |
+
actually
|
12 |
+
added
|
13 |
+
adj
|
14 |
+
affected
|
15 |
+
affecting
|
16 |
+
affects
|
17 |
+
after
|
18 |
+
afterwards
|
19 |
+
again
|
20 |
+
against
|
21 |
+
ah
|
22 |
+
all
|
23 |
+
almost
|
24 |
+
alone
|
25 |
+
along
|
26 |
+
already
|
27 |
+
also
|
28 |
+
although
|
29 |
+
always
|
30 |
+
am
|
31 |
+
among
|
32 |
+
amongst
|
33 |
+
an
|
34 |
+
and
|
35 |
+
announce
|
36 |
+
another
|
37 |
+
any
|
38 |
+
anybody
|
39 |
+
anyhow
|
40 |
+
anymore
|
41 |
+
anyone
|
42 |
+
anything
|
43 |
+
anyway
|
44 |
+
anyways
|
45 |
+
anywhere
|
46 |
+
apparently
|
47 |
+
approximately
|
48 |
+
are
|
49 |
+
aren
|
50 |
+
arent
|
51 |
+
arise
|
52 |
+
around
|
53 |
+
as
|
54 |
+
aside
|
55 |
+
ask
|
56 |
+
asking
|
57 |
+
at
|
58 |
+
auth
|
59 |
+
available
|
60 |
+
away
|
61 |
+
awfully
|
62 |
+
b
|
63 |
+
back
|
64 |
+
be
|
65 |
+
became
|
66 |
+
because
|
67 |
+
become
|
68 |
+
becomes
|
69 |
+
becoming
|
70 |
+
been
|
71 |
+
before
|
72 |
+
beforehand
|
73 |
+
begin
|
74 |
+
beginning
|
75 |
+
beginnings
|
76 |
+
begins
|
77 |
+
behind
|
78 |
+
being
|
79 |
+
believe
|
80 |
+
below
|
81 |
+
beside
|
82 |
+
besides
|
83 |
+
between
|
84 |
+
beyond
|
85 |
+
biol
|
86 |
+
both
|
87 |
+
brief
|
88 |
+
briefly
|
89 |
+
but
|
90 |
+
by
|
91 |
+
c
|
92 |
+
ca
|
93 |
+
came
|
94 |
+
can
|
95 |
+
cannot
|
96 |
+
can't
|
97 |
+
cause
|
98 |
+
causes
|
99 |
+
certain
|
100 |
+
certainly
|
101 |
+
co
|
102 |
+
com
|
103 |
+
come
|
104 |
+
comes
|
105 |
+
contain
|
106 |
+
containing
|
107 |
+
contains
|
108 |
+
could
|
109 |
+
couldnt
|
110 |
+
d
|
111 |
+
date
|
112 |
+
did
|
113 |
+
didn't
|
114 |
+
different
|
115 |
+
do
|
116 |
+
does
|
117 |
+
doesn't
|
118 |
+
doing
|
119 |
+
done
|
120 |
+
don't
|
121 |
+
down
|
122 |
+
downwards
|
123 |
+
due
|
124 |
+
during
|
125 |
+
e
|
126 |
+
each
|
127 |
+
ed
|
128 |
+
edu
|
129 |
+
effect
|
130 |
+
eg
|
131 |
+
eight
|
132 |
+
eighty
|
133 |
+
either
|
134 |
+
else
|
135 |
+
elsewhere
|
136 |
+
end
|
137 |
+
ending
|
138 |
+
enough
|
139 |
+
especially
|
140 |
+
et
|
141 |
+
et-al
|
142 |
+
etc
|
143 |
+
even
|
144 |
+
ever
|
145 |
+
every
|
146 |
+
everybody
|
147 |
+
everyone
|
148 |
+
everything
|
149 |
+
everywhere
|
150 |
+
ex
|
151 |
+
except
|
152 |
+
f
|
153 |
+
far
|
154 |
+
few
|
155 |
+
ff
|
156 |
+
fifth
|
157 |
+
first
|
158 |
+
five
|
159 |
+
fix
|
160 |
+
followed
|
161 |
+
following
|
162 |
+
follows
|
163 |
+
for
|
164 |
+
former
|
165 |
+
formerly
|
166 |
+
forth
|
167 |
+
found
|
168 |
+
four
|
169 |
+
from
|
170 |
+
further
|
171 |
+
furthermore
|
172 |
+
g
|
173 |
+
gave
|
174 |
+
get
|
175 |
+
gets
|
176 |
+
getting
|
177 |
+
give
|
178 |
+
given
|
179 |
+
gives
|
180 |
+
giving
|
181 |
+
go
|
182 |
+
goes
|
183 |
+
gone
|
184 |
+
got
|
185 |
+
gotten
|
186 |
+
h
|
187 |
+
had
|
188 |
+
happens
|
189 |
+
hardly
|
190 |
+
has
|
191 |
+
hasn't
|
192 |
+
have
|
193 |
+
haven't
|
194 |
+
having
|
195 |
+
he
|
196 |
+
hed
|
197 |
+
hence
|
198 |
+
her
|
199 |
+
here
|
200 |
+
hereafter
|
201 |
+
hereby
|
202 |
+
herein
|
203 |
+
heres
|
204 |
+
hereupon
|
205 |
+
hers
|
206 |
+
herself
|
207 |
+
hes
|
208 |
+
hi
|
209 |
+
hid
|
210 |
+
him
|
211 |
+
himself
|
212 |
+
his
|
213 |
+
hither
|
214 |
+
home
|
215 |
+
how
|
216 |
+
howbeit
|
217 |
+
however
|
218 |
+
hundred
|
219 |
+
i
|
220 |
+
id
|
221 |
+
ie
|
222 |
+
if
|
223 |
+
i'll
|
224 |
+
im
|
225 |
+
immediate
|
226 |
+
immediately
|
227 |
+
importance
|
228 |
+
important
|
229 |
+
in
|
230 |
+
inc
|
231 |
+
indeed
|
232 |
+
index
|
233 |
+
information
|
234 |
+
instead
|
235 |
+
into
|
236 |
+
invention
|
237 |
+
inward
|
238 |
+
is
|
239 |
+
isn't
|
240 |
+
it
|
241 |
+
itd
|
242 |
+
it'll
|
243 |
+
its
|
244 |
+
itself
|
245 |
+
i've
|
246 |
+
j
|
247 |
+
just
|
248 |
+
k
|
249 |
+
keep
keeps
|
250 |
+
kept
|
251 |
+
kg
|
252 |
+
km
|
253 |
+
know
|
254 |
+
known
|
255 |
+
knows
|
256 |
+
l
|
257 |
+
largely
|
258 |
+
last
|
259 |
+
lately
|
260 |
+
later
|
261 |
+
latter
|
262 |
+
latterly
|
263 |
+
least
|
264 |
+
less
|
265 |
+
lest
|
266 |
+
let
|
267 |
+
lets
|
268 |
+
like
|
269 |
+
liked
|
270 |
+
likely
|
271 |
+
line
|
272 |
+
little
|
273 |
+
'll
|
274 |
+
look
|
275 |
+
looking
|
276 |
+
looks
|
277 |
+
ltd
|
278 |
+
m
|
279 |
+
made
|
280 |
+
mainly
|
281 |
+
make
|
282 |
+
makes
|
283 |
+
many
|
284 |
+
may
|
285 |
+
maybe
|
286 |
+
me
|
287 |
+
mean
|
288 |
+
means
|
289 |
+
meantime
|
290 |
+
meanwhile
|
291 |
+
merely
|
292 |
+
mg
|
293 |
+
might
|
294 |
+
million
|
295 |
+
miss
|
296 |
+
ml
|
297 |
+
more
|
298 |
+
moreover
|
299 |
+
most
|
300 |
+
mostly
|
301 |
+
mr
|
302 |
+
mrs
|
303 |
+
much
|
304 |
+
mug
|
305 |
+
must
|
306 |
+
my
|
307 |
+
myself
|
308 |
+
n
|
309 |
+
na
|
310 |
+
name
|
311 |
+
namely
|
312 |
+
nay
|
313 |
+
nd
|
314 |
+
near
|
315 |
+
nearly
|
316 |
+
necessarily
|
317 |
+
necessary
|
318 |
+
need
|
319 |
+
needs
|
320 |
+
neither
|
321 |
+
never
|
322 |
+
nevertheless
|
323 |
+
new
|
324 |
+
next
|
325 |
+
nine
|
326 |
+
ninety
|
327 |
+
no
|
328 |
+
nobody
|
329 |
+
non
|
330 |
+
none
|
331 |
+
nonetheless
|
332 |
+
noone
|
333 |
+
nor
|
334 |
+
normally
|
335 |
+
nos
|
336 |
+
not
|
337 |
+
noted
|
338 |
+
nothing
|
339 |
+
now
|
340 |
+
nowhere
|
341 |
+
o
|
342 |
+
obtain
|
343 |
+
obtained
|
344 |
+
obviously
|
345 |
+
of
|
346 |
+
off
|
347 |
+
often
|
348 |
+
oh
|
349 |
+
ok
|
350 |
+
okay
|
351 |
+
old
|
352 |
+
omitted
|
353 |
+
on
|
354 |
+
once
|
355 |
+
one
|
356 |
+
ones
|
357 |
+
only
|
358 |
+
onto
|
359 |
+
or
|
360 |
+
ord
|
361 |
+
other
|
362 |
+
others
|
363 |
+
otherwise
|
364 |
+
ought
|
365 |
+
our
|
366 |
+
ours
|
367 |
+
ourselves
|
368 |
+
out
|
369 |
+
outside
|
370 |
+
over
|
371 |
+
overall
|
372 |
+
owing
|
373 |
+
own
|
374 |
+
p
|
375 |
+
page
|
376 |
+
pages
|
377 |
+
part
|
378 |
+
particular
|
379 |
+
particularly
|
380 |
+
past
|
381 |
+
per
|
382 |
+
perhaps
|
383 |
+
placed
|
384 |
+
please
|
385 |
+
plus
|
386 |
+
poorly
|
387 |
+
possible
|
388 |
+
possibly
|
389 |
+
potentially
|
390 |
+
pp
|
391 |
+
predominantly
|
392 |
+
present
|
393 |
+
previously
|
394 |
+
primarily
|
395 |
+
probably
|
396 |
+
promptly
|
397 |
+
proud
|
398 |
+
provides
|
399 |
+
put
|
400 |
+
q
|
401 |
+
que
|
402 |
+
quickly
|
403 |
+
quite
|
404 |
+
qv
|
405 |
+
r
|
406 |
+
ran
|
407 |
+
rather
|
408 |
+
rd
|
409 |
+
re
|
410 |
+
readily
|
411 |
+
really
|
412 |
+
recent
|
413 |
+
recently
|
414 |
+
ref
|
415 |
+
refs
|
416 |
+
regarding
|
417 |
+
regardless
|
418 |
+
regards
|
419 |
+
related
|
420 |
+
relatively
|
421 |
+
research
|
422 |
+
respectively
|
423 |
+
resulted
|
424 |
+
resulting
|
425 |
+
results
|
426 |
+
right
|
427 |
+
run
|
428 |
+
s
|
429 |
+
said
|
430 |
+
same
|
431 |
+
saw
|
432 |
+
say
|
433 |
+
saying
|
434 |
+
says
|
435 |
+
sec
|
436 |
+
section
|
437 |
+
see
|
438 |
+
seeing
|
439 |
+
seem
|
440 |
+
seemed
|
441 |
+
seeming
|
442 |
+
seems
|
443 |
+
seen
|
444 |
+
self
|
445 |
+
selves
|
446 |
+
sent
|
447 |
+
seven
|
448 |
+
several
|
449 |
+
shall
|
450 |
+
she
|
451 |
+
shed
|
452 |
+
she'll
|
453 |
+
shes
|
454 |
+
should
|
455 |
+
shouldn't
|
456 |
+
show
|
457 |
+
showed
|
458 |
+
shown
|
459 |
+
showns
|
460 |
+
shows
|
461 |
+
significant
|
462 |
+
significantly
|
463 |
+
similar
|
464 |
+
similarly
|
465 |
+
since
|
466 |
+
six
|
467 |
+
slightly
|
468 |
+
so
|
469 |
+
some
|
470 |
+
somebody
|
471 |
+
somehow
|
472 |
+
someone
|
473 |
+
somethan
|
474 |
+
something
|
475 |
+
sometime
|
476 |
+
sometimes
|
477 |
+
somewhat
|
478 |
+
somewhere
|
479 |
+
soon
|
480 |
+
sorry
|
481 |
+
specifically
|
482 |
+
specified
|
483 |
+
specify
|
484 |
+
specifying
|
485 |
+
still
|
486 |
+
stop
|
487 |
+
strongly
|
488 |
+
sub
|
489 |
+
substantially
|
490 |
+
successfully
|
491 |
+
such
|
492 |
+
sufficiently
|
493 |
+
suggest
|
494 |
+
sup
|
495 |
+
sure
|
496 |
+
invention
|
497 |
+
method
|
498 |
+
|
499 |
+
|
500 |
+
|
App/assets/stopword_patent_English
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
scope
|
2 |
+
about
|
3 |
+
contiguous
|
4 |
+
device for
|
5 |
+
dispose
|
6 |
+
include
|
7 |
+
further
heretofore
|
8 |
+
indicium
|
9 |
+
means
|
10 |
+
member
|
11 |
+
multitude
|
12 |
+
pivotability
|
13 |
+
whereby
|
14 |
+
such that
|
15 |
+
thereby
|
16 |
+
wherein
|
17 |
+
surrounding
|
18 |
+
substantially
|
19 |
+
so that
|
20 |
+
thereof
|
21 |
+
system
|
22 |
+
process
|
23 |
+
invention
|
24 |
+
a kind of
|
25 |
+
equipment
|
26 |
+
belong to
|
27 |
+
according to
|
28 |
+
characterized in that
|
29 |
+
provide
|
30 |
+
get
|
31 |
+
characterized in that
|
32 |
+
handle
|
33 |
+
achieve
|
34 |
+
form
|
35 |
+
used for
|
36 |
+
utilize
|
37 |
+
adopt
|
38 |
+
get
|
App/assets/stopword_rake.txt
ADDED
@@ -0,0 +1,590 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#stop word list from SMART (Salton,1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
|
2 |
+
a
|
3 |
+
a's
|
4 |
+
able
|
5 |
+
about
|
6 |
+
above
|
7 |
+
according
|
8 |
+
accordingly
|
9 |
+
across
|
10 |
+
actually
|
11 |
+
after
|
12 |
+
afterwards
|
13 |
+
again
|
14 |
+
against
|
15 |
+
ain't
|
16 |
+
all
|
17 |
+
allow
|
18 |
+
allows
|
19 |
+
almost
|
20 |
+
alone
|
21 |
+
along
|
22 |
+
already
|
23 |
+
also
|
24 |
+
although
|
25 |
+
always
|
26 |
+
am
|
27 |
+
among
|
28 |
+
amongst
|
29 |
+
an
|
30 |
+
and
|
31 |
+
another
|
32 |
+
any
|
33 |
+
anybody
|
34 |
+
anyhow
|
35 |
+
anyone
|
36 |
+
anything
|
37 |
+
anyway
|
38 |
+
anyways
|
39 |
+
anywhere
|
40 |
+
apart
|
41 |
+
appear
|
42 |
+
appreciate
|
43 |
+
appropriate
|
44 |
+
are
|
45 |
+
aren't
|
46 |
+
around
|
47 |
+
as
|
48 |
+
aside
|
49 |
+
ask
|
50 |
+
asking
|
51 |
+
associated
|
52 |
+
at
|
53 |
+
available
|
54 |
+
away
|
55 |
+
awfully
|
56 |
+
b
|
57 |
+
be
|
58 |
+
became
|
59 |
+
because
|
60 |
+
become
|
61 |
+
becomes
|
62 |
+
becoming
|
63 |
+
been
|
64 |
+
before
|
65 |
+
beforehand
|
66 |
+
behind
|
67 |
+
being
|
68 |
+
believe
|
69 |
+
below
|
70 |
+
beside
|
71 |
+
besides
|
72 |
+
best
|
73 |
+
better
|
74 |
+
between
|
75 |
+
beyond
|
76 |
+
both
|
77 |
+
brief
|
78 |
+
but
|
79 |
+
by
|
80 |
+
c
|
81 |
+
c'mon
|
82 |
+
c's
|
83 |
+
came
|
84 |
+
can
|
85 |
+
can't
|
86 |
+
cannot
|
87 |
+
cant
|
88 |
+
cause
|
89 |
+
causes
|
90 |
+
certain
|
91 |
+
certainly
|
92 |
+
changes
|
93 |
+
clearly
|
94 |
+
co
|
95 |
+
com
|
96 |
+
come
|
97 |
+
comes
|
98 |
+
concerning
|
99 |
+
consequently
|
100 |
+
consider
|
101 |
+
considering
|
102 |
+
contain
|
103 |
+
containing
|
104 |
+
contains
|
105 |
+
corresponding
|
106 |
+
could
|
107 |
+
couldn't
|
108 |
+
course
|
109 |
+
currently
|
110 |
+
d
|
111 |
+
definitely
|
112 |
+
described
|
113 |
+
despite
|
114 |
+
did
|
115 |
+
didn't
|
116 |
+
different
|
117 |
+
do
|
118 |
+
does
|
119 |
+
doesn't
|
120 |
+
doing
|
121 |
+
don't
|
122 |
+
done
|
123 |
+
down
|
124 |
+
downwards
|
125 |
+
during
|
126 |
+
e
|
127 |
+
each
|
128 |
+
edu
|
129 |
+
eg
|
130 |
+
eight
|
131 |
+
either
|
132 |
+
else
|
133 |
+
elsewhere
|
134 |
+
enough
|
135 |
+
entirely
|
136 |
+
especially
|
137 |
+
et
|
138 |
+
etc
|
139 |
+
even
|
140 |
+
ever
|
141 |
+
every
|
142 |
+
everybody
|
143 |
+
everyone
|
144 |
+
everything
|
145 |
+
everywhere
|
146 |
+
ex
|
147 |
+
exactly
|
148 |
+
example
|
149 |
+
except
|
150 |
+
f
|
151 |
+
far
|
152 |
+
few
|
153 |
+
fifth
|
154 |
+
first
|
155 |
+
five
|
156 |
+
followed
|
157 |
+
following
|
158 |
+
follows
|
159 |
+
for
|
160 |
+
former
|
161 |
+
formerly
|
162 |
+
forth
|
163 |
+
four
|
164 |
+
from
|
165 |
+
further
|
166 |
+
furthermore
|
167 |
+
g
|
168 |
+
get
|
169 |
+
gets
|
170 |
+
getting
|
171 |
+
given
|
172 |
+
gives
|
173 |
+
go
|
174 |
+
goes
|
175 |
+
going
|
176 |
+
gone
|
177 |
+
got
|
178 |
+
gotten
|
179 |
+
greetings
|
180 |
+
h
|
181 |
+
had
|
182 |
+
hadn't
|
183 |
+
happens
|
184 |
+
hardly
|
185 |
+
has
|
186 |
+
hasn't
|
187 |
+
have
|
188 |
+
haven't
|
189 |
+
having
|
190 |
+
he
|
191 |
+
he's
|
192 |
+
hello
|
193 |
+
help
|
194 |
+
hence
|
195 |
+
her
|
196 |
+
here
|
197 |
+
here's
|
198 |
+
hereafter
|
199 |
+
hereby
|
200 |
+
herein
|
201 |
+
hereupon
|
202 |
+
hers
|
203 |
+
herself
|
204 |
+
hi
|
205 |
+
him
|
206 |
+
himself
|
207 |
+
his
|
208 |
+
hither
|
209 |
+
hopefully
|
210 |
+
how
|
211 |
+
howbeit
|
212 |
+
however
|
213 |
+
i
|
214 |
+
i'd
|
215 |
+
i'll
|
216 |
+
i'm
|
217 |
+
i've
|
218 |
+
ie
|
219 |
+
if
|
220 |
+
ignored
|
221 |
+
immediate
|
222 |
+
in
|
223 |
+
inasmuch
|
224 |
+
inc
|
225 |
+
indeed
|
226 |
+
indicate
|
227 |
+
indicated
|
228 |
+
indicates
|
229 |
+
inner
|
230 |
+
insofar
|
231 |
+
instead
|
232 |
+
into
|
233 |
+
inward
|
234 |
+
is
|
235 |
+
isn't
|
236 |
+
it
|
237 |
+
it'd
|
238 |
+
it'll
|
239 |
+
it's
|
240 |
+
its
|
241 |
+
itself
|
242 |
+
j
|
243 |
+
just
|
244 |
+
k
|
245 |
+
keep
|
246 |
+
keeps
|
247 |
+
kept
|
248 |
+
know
|
249 |
+
knows
|
250 |
+
known
|
251 |
+
l
|
252 |
+
last
|
253 |
+
lately
|
254 |
+
later
|
255 |
+
latter
|
256 |
+
latterly
|
257 |
+
least
|
258 |
+
less
|
259 |
+
lest
|
260 |
+
let
|
261 |
+
let's
|
262 |
+
like
|
263 |
+
liked
|
264 |
+
likely
|
265 |
+
little
|
266 |
+
look
|
267 |
+
looking
|
268 |
+
looks
|
269 |
+
ltd
|
270 |
+
m
|
271 |
+
mainly
|
272 |
+
many
|
273 |
+
may
|
274 |
+
maybe
|
275 |
+
me
|
276 |
+
mean
|
277 |
+
meanwhile
|
278 |
+
merely
|
279 |
+
might
|
280 |
+
more
|
281 |
+
moreover
|
282 |
+
most
|
283 |
+
mostly
|
284 |
+
much
|
285 |
+
must
|
286 |
+
my
|
287 |
+
myself
|
288 |
+
n
|
289 |
+
name
|
290 |
+
namely
|
291 |
+
nd
|
292 |
+
near
|
293 |
+
nearly
|
294 |
+
necessary
|
295 |
+
need
|
296 |
+
needs
|
297 |
+
neither
|
298 |
+
never
|
299 |
+
nevertheless
|
300 |
+
new
|
301 |
+
next
|
302 |
+
nine
|
303 |
+
no
|
304 |
+
nobody
|
305 |
+
non
|
306 |
+
none
|
307 |
+
noone
|
308 |
+
nor
|
309 |
+
normally
|
310 |
+
not
|
311 |
+
nothing
|
312 |
+
novel
|
313 |
+
now
|
314 |
+
nowhere
|
315 |
+
o
|
316 |
+
obviously
|
317 |
+
of
|
318 |
+
off
|
319 |
+
often
|
320 |
+
oh
|
321 |
+
ok
|
322 |
+
okay
|
323 |
+
old
|
324 |
+
on
|
325 |
+
once
|
326 |
+
one
|
327 |
+
ones
|
328 |
+
only
|
329 |
+
onto
|
330 |
+
or
|
331 |
+
other
|
332 |
+
others
|
333 |
+
otherwise
|
334 |
+
ought
|
335 |
+
our
|
336 |
+
ours
|
337 |
+
ourselves
|
338 |
+
out
|
339 |
+
outside
|
340 |
+
over
|
341 |
+
overall
|
342 |
+
own
|
343 |
+
p
|
344 |
+
particular
|
345 |
+
particularly
|
346 |
+
per
|
347 |
+
perhaps
|
348 |
+
placed
|
349 |
+
please
|
350 |
+
plus
|
351 |
+
possible
|
352 |
+
presumably
|
353 |
+
probably
|
354 |
+
provides
|
355 |
+
q
|
356 |
+
que
|
357 |
+
quite
|
358 |
+
qv
|
359 |
+
r
|
360 |
+
rather
|
361 |
+
present
|
362 |
+
rd
|
363 |
+
wherein
|
364 |
+
comprises
|
365 |
+
device
|
366 |
+
method
|
367 |
+
disclosure
|
368 |
+
comprising
|
369 |
+
providing
|
370 |
+
including
|
371 |
+
re
|
372 |
+
really
|
373 |
+
reasonably
|
374 |
+
regarding
|
375 |
+
regardless
|
376 |
+
regards
|
377 |
+
relatively
|
378 |
+
respectively
|
379 |
+
right
|
380 |
+
s
|
381 |
+
said
|
382 |
+
assembly
|
383 |
+
same
|
384 |
+
saw
|
385 |
+
say
|
386 |
+
saying
|
387 |
+
says
|
388 |
+
second
|
389 |
+
secondly
|
390 |
+
see
|
391 |
+
seeing
|
392 |
+
seem
|
393 |
+
seemed
|
394 |
+
seeming
|
395 |
+
seems
|
396 |
+
seen
|
397 |
+
self
|
398 |
+
selves
|
399 |
+
element
|
400 |
+
sensible
|
401 |
+
sent
|
402 |
+
serious
|
403 |
+
seriously
|
404 |
+
seven
|
405 |
+
several
|
406 |
+
shall
|
407 |
+
she
|
408 |
+
should
|
409 |
+
shouldn't
|
410 |
+
since
|
411 |
+
six
|
412 |
+
so
|
413 |
+
some
|
414 |
+
somebody
|
415 |
+
somehow
|
416 |
+
someone
|
417 |
+
something
|
418 |
+
sometime
|
419 |
+
sometimes
|
420 |
+
somewhat
|
421 |
+
somewhere
|
422 |
+
soon
|
423 |
+
sorry
|
424 |
+
specified
|
425 |
+
specify
|
426 |
+
specifying
|
427 |
+
still
|
428 |
+
sub
|
429 |
+
such
|
430 |
+
sup
|
431 |
+
sure
|
432 |
+
t
|
433 |
+
t's
|
434 |
+
take
|
435 |
+
taken
|
436 |
+
tell
|
437 |
+
tends
|
438 |
+
th
|
439 |
+
than
|
440 |
+
thank
|
441 |
+
thanks
|
442 |
+
thanx
|
443 |
+
that
|
444 |
+
that's
|
445 |
+
thats
|
446 |
+
the
|
447 |
+
their
|
448 |
+
theirs
|
449 |
+
them
|
450 |
+
themselves
|
451 |
+
then
|
452 |
+
thence
|
453 |
+
there
|
454 |
+
there's
|
455 |
+
thereafter
|
456 |
+
thereby
|
457 |
+
therefore
|
458 |
+
therein
|
459 |
+
theres
|
460 |
+
thereupon
|
461 |
+
these
|
462 |
+
they
|
463 |
+
they'd
|
464 |
+
they'll
|
465 |
+
they're
|
466 |
+
they've
|
467 |
+
think
|
468 |
+
third
|
469 |
+
this
|
470 |
+
thorough
|
471 |
+
thoroughly
|
472 |
+
those
|
473 |
+
though
|
474 |
+
three
|
475 |
+
through
|
476 |
+
throughout
|
477 |
+
thru
|
478 |
+
thus
|
479 |
+
to
|
480 |
+
together
|
481 |
+
too
|
482 |
+
took
|
483 |
+
toward
|
484 |
+
towards
|
485 |
+
tried
|
486 |
+
tries
|
487 |
+
truly
|
488 |
+
try
|
489 |
+
trying
|
490 |
+
twice
|
491 |
+
two
|
492 |
+
u
|
493 |
+
un
|
494 |
+
under
|
495 |
+
unfortunately
|
496 |
+
unless
|
497 |
+
unlikely
|
498 |
+
until
|
499 |
+
unto
|
500 |
+
up
|
501 |
+
upon
|
502 |
+
us
|
503 |
+
use
|
504 |
+
used
|
505 |
+
useful
|
506 |
+
uses
|
507 |
+
using
|
508 |
+
usually
|
509 |
+
uucp
|
510 |
+
v
|
511 |
+
value
|
512 |
+
various
|
513 |
+
very
|
514 |
+
via
|
515 |
+
viz
|
516 |
+
vs
|
517 |
+
w
|
518 |
+
want
|
519 |
+
wants
|
520 |
+
was
|
521 |
+
wasn't
|
522 |
+
way
|
523 |
+
we
|
524 |
+
we'd
|
525 |
+
we'll
|
526 |
+
we're
|
527 |
+
we've
|
528 |
+
welcome
|
529 |
+
well
|
530 |
+
went
|
531 |
+
were
|
532 |
+
weren't
|
533 |
+
what
|
534 |
+
what's
|
535 |
+
whatever
|
536 |
+
when
|
537 |
+
whence
|
538 |
+
whenever
|
539 |
+
where
|
540 |
+
where's
|
541 |
+
whereafter
|
542 |
+
whereas
|
543 |
+
whereby
|
544 |
+
wherein
|
545 |
+
whereupon
|
546 |
+
wherever
|
547 |
+
whether
|
548 |
+
which
|
549 |
+
while
|
550 |
+
whither
|
551 |
+
who
|
552 |
+
who's
|
553 |
+
whoever
|
554 |
+
whole
|
555 |
+
whom
|
556 |
+
whose
|
557 |
+
why
|
558 |
+
will
|
559 |
+
willing
|
560 |
+
wish
|
561 |
+
unit
|
562 |
+
with
|
563 |
+
comprising
|
564 |
+
including
|
565 |
+
within
|
566 |
+
without
|
567 |
+
won't
|
568 |
+
wonder
|
569 |
+
section
|
570 |
+
would
|
571 |
+
would
|
572 |
+
wouldn't
|
573 |
+
system
|
574 |
+
amount
|
575 |
+
comprise
|
576 |
+
x
|
577 |
+
y
|
578 |
+
yes
|
579 |
+
yet
|
580 |
+
you
|
581 |
+
you'd
|
582 |
+
you'll
|
583 |
+
you're
|
584 |
+
you've
|
585 |
+
your
|
586 |
+
yours
|
587 |
+
yourself
|
588 |
+
yourselves
|
589 |
+
z
|
590 |
+
zero
|
App/assets/stopwords
ADDED
@@ -0,0 +1,567 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'll
|
2 |
+
a
|
3 |
+
a kind of
|
4 |
+
able
|
5 |
+
about
|
6 |
+
above
|
7 |
+
abst
|
8 |
+
accordance
|
9 |
+
according
|
10 |
+
according to
|
11 |
+
accordingly
|
12 |
+
achieve
|
13 |
+
across
|
14 |
+
act
|
15 |
+
actually
|
16 |
+
added
|
17 |
+
adj
|
18 |
+
adopt
|
19 |
+
affected
|
20 |
+
affecting
|
21 |
+
affects
|
22 |
+
after
|
23 |
+
afterwards
|
24 |
+
again
|
25 |
+
against
|
26 |
+
ah
|
27 |
+
all
|
28 |
+
almost
|
29 |
+
alone
|
30 |
+
along
|
31 |
+
already
|
32 |
+
also
|
33 |
+
although
|
34 |
+
always
|
35 |
+
am
|
36 |
+
among
|
37 |
+
amongst
|
38 |
+
an
|
39 |
+
and
|
40 |
+
announce
|
41 |
+
another
|
42 |
+
any
|
43 |
+
anybody
|
44 |
+
anyhow
|
45 |
+
anymore
|
46 |
+
anyone
|
47 |
+
anything
|
48 |
+
anyway
|
49 |
+
anyways
|
50 |
+
anywhere
|
51 |
+
apparently
|
52 |
+
approximately
|
53 |
+
are
|
54 |
+
aren
|
55 |
+
arent
|
56 |
+
arise
|
57 |
+
around
|
58 |
+
as
|
59 |
+
aside
|
60 |
+
ask
|
61 |
+
asking
|
62 |
+
at
|
63 |
+
auth
|
64 |
+
available
|
65 |
+
away
|
66 |
+
awfully
|
67 |
+
b
|
68 |
+
back
|
69 |
+
be
|
70 |
+
became
|
71 |
+
because
|
72 |
+
become
|
73 |
+
becomes
|
74 |
+
becoming
|
75 |
+
been
|
76 |
+
before
|
77 |
+
beforehand
|
78 |
+
begin
|
79 |
+
beginning
|
80 |
+
beginnings
|
81 |
+
begins
|
82 |
+
behind
|
83 |
+
being
|
84 |
+
believe
|
85 |
+
belong to
|
86 |
+
below
|
87 |
+
beside
|
88 |
+
besides
|
89 |
+
between
|
90 |
+
beyond
|
91 |
+
biol
|
92 |
+
both
|
93 |
+
brief
|
94 |
+
briefly
|
95 |
+
but
|
96 |
+
by
|
97 |
+
c
|
98 |
+
ca
|
99 |
+
came
|
100 |
+
can
|
101 |
+
can't
|
102 |
+
cannot
|
103 |
+
cause
|
104 |
+
causes
|
105 |
+
certain
|
106 |
+
certainly
|
107 |
+
characterized in that
|
108 |
+
co
|
109 |
+
com
|
110 |
+
come
|
111 |
+
comes
|
112 |
+
contain
|
113 |
+
containing
|
114 |
+
contains
|
115 |
+
contiguous
|
116 |
+
could
|
117 |
+
couldnt
|
118 |
+
d
|
119 |
+
comprises
|
120 |
+
date
|
121 |
+
device for
|
122 |
+
did
|
123 |
+
didn't
|
124 |
+
different
|
125 |
+
dispose
|
126 |
+
do
|
127 |
+
does
|
128 |
+
doesn't
|
129 |
+
doing
|
130 |
+
don't
|
131 |
+
done
|
132 |
+
down
|
133 |
+
downwards
|
134 |
+
due
|
135 |
+
during
|
136 |
+
e
|
137 |
+
each
|
138 |
+
ed
|
139 |
+
edu
|
140 |
+
effect
|
141 |
+
eg
|
142 |
+
eight
|
143 |
+
eighty
|
144 |
+
either
|
145 |
+
else
|
146 |
+
elsewhere
|
147 |
+
end
|
148 |
+
ending
|
149 |
+
enough
|
150 |
+
equipment
|
151 |
+
especially
|
152 |
+
et
|
153 |
+
et-al
|
154 |
+
etc
|
155 |
+
even
|
156 |
+
ever
|
157 |
+
every
|
158 |
+
everybody
|
159 |
+
everyone
|
160 |
+
everything
|
161 |
+
everywhere
|
162 |
+
ex
|
163 |
+
except
|
164 |
+
f
|
165 |
+
far
|
166 |
+
few
|
167 |
+
ff
|
168 |
+
fifth
|
169 |
+
first
|
170 |
+
five
|
171 |
+
fix
|
172 |
+
followed
|
173 |
+
following
|
174 |
+
follows
|
175 |
+
for
|
176 |
+
form
|
177 |
+
former
|
178 |
+
formerly
|
179 |
+
forth
|
180 |
+
found
|
181 |
+
four
|
182 |
+
from
|
183 |
+
further
|
184 |
+
further
heretofore
|
185 |
+
furthermore
|
186 |
+
g
|
187 |
+
gave
|
188 |
+
get
|
189 |
+
gets
|
190 |
+
getting
|
191 |
+
give
|
192 |
+
given
|
193 |
+
gives
|
194 |
+
giving
|
195 |
+
go
|
196 |
+
goes
|
197 |
+
gone
|
198 |
+
got
|
199 |
+
gotten
|
200 |
+
h
|
201 |
+
had
|
202 |
+
handle
|
203 |
+
happens
|
204 |
+
hardly
|
205 |
+
has
|
206 |
+
hasn't
|
207 |
+
have
|
208 |
+
haven't
|
209 |
+
having
|
210 |
+
he
|
211 |
+
hed
|
212 |
+
hence
|
213 |
+
her
|
214 |
+
here
|
215 |
+
hereafter
|
216 |
+
hereby
|
217 |
+
herein
|
218 |
+
heres
|
219 |
+
hereupon
|
220 |
+
hers
|
221 |
+
herself
|
222 |
+
hes
|
223 |
+
hi
|
224 |
+
hid
|
225 |
+
him
|
226 |
+
himself
|
227 |
+
his
|
228 |
+
hither
|
229 |
+
home
|
230 |
+
how
|
231 |
+
howbeit
|
232 |
+
however
|
233 |
+
hundred
|
234 |
+
i
|
235 |
+
i'll
|
236 |
+
i've
|
237 |
+
id
|
238 |
+
ie
|
239 |
+
if
|
240 |
+
im
|
241 |
+
immediate
|
242 |
+
immediately
|
243 |
+
importance
|
244 |
+
important
|
245 |
+
in
|
246 |
+
inc
|
247 |
+
include
|
248 |
+
indeed
|
249 |
+
index
|
250 |
+
indicium
|
251 |
+
information
|
252 |
+
instead
|
253 |
+
into
|
254 |
+
invention
|
255 |
+
inward
|
256 |
+
is
|
257 |
+
isn't
|
258 |
+
it
|
259 |
+
it'll
|
260 |
+
itd
|
261 |
+
its
|
262 |
+
itself
|
263 |
+
j
|
264 |
+
just
|
265 |
+
k
|
266 |
+
keep
|
267 |
+
kept
|
268 |
+
kg
|
269 |
+
km
|
270 |
+
know
|
271 |
+
known
|
272 |
+
knows
|
273 |
+
l
|
274 |
+
largely
|
275 |
+
last
|
276 |
+
lately
|
277 |
+
later
|
278 |
+
latter
|
279 |
+
latterly
|
280 |
+
least
|
281 |
+
less
|
282 |
+
lest
|
283 |
+
let
|
284 |
+
lets
|
285 |
+
like
|
286 |
+
liked
|
287 |
+
likely
|
288 |
+
line
|
289 |
+
little
|
290 |
+
look
|
291 |
+
looking
|
292 |
+
looks
|
293 |
+
ltd
|
294 |
+
m
|
295 |
+
made
|
296 |
+
mainly
|
297 |
+
keeps
|
298 |
+
make
|
299 |
+
makes
|
300 |
+
comprise
|
301 |
+
comprises
|
302 |
+
many
|
303 |
+
may
|
304 |
+
maybe
|
305 |
+
me
|
306 |
+
mean
|
307 |
+
means
|
308 |
+
meantime
|
309 |
+
meanwhile
|
310 |
+
member
|
311 |
+
merely
|
312 |
+
method
|
313 |
+
mg
|
314 |
+
might
|
315 |
+
million
|
316 |
+
miss
|
317 |
+
ml
|
318 |
+
more
|
319 |
+
moreover
|
320 |
+
most
|
321 |
+
mostly
|
322 |
+
mr
|
323 |
+
mrs
|
324 |
+
much
|
325 |
+
mug
|
326 |
+
multitude
|
327 |
+
must
|
328 |
+
my
|
329 |
+
myself
|
330 |
+
n
|
331 |
+
na
|
332 |
+
name
|
333 |
+
namely
|
334 |
+
nay
|
335 |
+
nd
|
336 |
+
near
|
337 |
+
nearly
|
338 |
+
necessarily
|
339 |
+
necessary
|
340 |
+
need
|
341 |
+
needs
|
342 |
+
neither
|
343 |
+
never
|
344 |
+
nevertheless
|
345 |
+
new
|
346 |
+
next
|
347 |
+
nine
|
348 |
+
ninety
|
349 |
+
no
|
350 |
+
nobody
|
351 |
+
non
|
352 |
+
none
|
353 |
+
nonetheless
|
354 |
+
noone
|
355 |
+
nor
|
356 |
+
normally
|
357 |
+
nos
|
358 |
+
not
|
359 |
+
noted
|
360 |
+
nothing
|
361 |
+
now
|
362 |
+
nowhere
|
363 |
+
o
|
364 |
+
obtain
|
365 |
+
obtained
|
366 |
+
obviously
|
367 |
+
of
|
368 |
+
off
|
369 |
+
often
|
370 |
+
oh
|
371 |
+
ok
|
372 |
+
okay
|
373 |
+
old
|
374 |
+
omitted
|
375 |
+
on
|
376 |
+
once
|
377 |
+
one
|
378 |
+
ones
|
379 |
+
only
|
380 |
+
onto
|
381 |
+
or
|
382 |
+
ord
|
383 |
+
other
|
384 |
+
others
|
385 |
+
otherwise
|
386 |
+
ought
|
387 |
+
our
|
388 |
+
ours
|
389 |
+
ourselves
|
390 |
+
out
|
391 |
+
outside
|
392 |
+
over
|
393 |
+
overall
|
394 |
+
owing
|
395 |
+
own
|
396 |
+
p
|
397 |
+
page
|
398 |
+
pages
|
399 |
+
part
|
400 |
+
particular
|
401 |
+
particularly
|
402 |
+
past
|
403 |
+
per
|
404 |
+
perhaps
|
405 |
+
pivotability
|
406 |
+
placed
|
407 |
+
please
|
408 |
+
plus
|
409 |
+
poorly
|
410 |
+
possible
|
411 |
+
possibly
|
412 |
+
potentially
|
413 |
+
pp
|
414 |
+
predominantly
|
415 |
+
present
|
416 |
+
previously
|
417 |
+
primarily
|
418 |
+
probably
|
419 |
+
process
|
420 |
+
promptly
|
421 |
+
proud
|
422 |
+
provide
|
423 |
+
provides
|
424 |
+
put
|
425 |
+
q
|
426 |
+
que
|
427 |
+
quickly
|
428 |
+
quite
|
429 |
+
qv
|
430 |
+
r
|
431 |
+
ran
|
432 |
+
rather
|
433 |
+
rd
|
434 |
+
re
|
435 |
+
readily
|
436 |
+
really
|
437 |
+
recent
|
438 |
+
recently
|
439 |
+
ref
|
440 |
+
refs
|
441 |
+
regarding
|
442 |
+
regardless
|
443 |
+
regards
|
444 |
+
related
|
445 |
+
relatively
|
446 |
+
research
|
447 |
+
respectively
|
448 |
+
resulted
|
449 |
+
resulting
|
450 |
+
results
|
451 |
+
right
|
452 |
+
run
|
453 |
+
s
|
454 |
+
said
|
455 |
+
same
|
456 |
+
saw
|
457 |
+
say
|
458 |
+
saying
|
459 |
+
says
|
460 |
+
scope
|
461 |
+
sec
|
462 |
+
section
|
463 |
+
see
|
464 |
+
seeing
|
465 |
+
seem
|
466 |
+
seemed
|
467 |
+
seeming
|
468 |
+
seems
|
469 |
+
seen
|
470 |
+
self
|
471 |
+
selves
|
472 |
+
sent
|
473 |
+
seven
|
474 |
+
several
|
475 |
+
shall
|
476 |
+
she
|
477 |
+
she'll
|
478 |
+
shed
|
479 |
+
shes
|
480 |
+
should
|
481 |
+
shouldn't
|
482 |
+
show
|
483 |
+
showed
|
484 |
+
shown
|
485 |
+
showns
|
486 |
+
shows
|
487 |
+
significant
|
488 |
+
significantly
|
489 |
+
similar
|
490 |
+
similarly
|
491 |
+
since
|
492 |
+
six
|
493 |
+
slightly
|
494 |
+
so
|
495 |
+
so that
|
496 |
+
some
|
497 |
+
somebody
|
498 |
+
somehow
|
499 |
+
someone
|
500 |
+
somethan
|
501 |
+
something
|
502 |
+
sometime
|
503 |
+
sometimes
|
504 |
+
somewhat
|
505 |
+
somewhere
|
506 |
+
soon
|
507 |
+
sorry
|
508 |
+
specifically
|
509 |
+
specified
|
510 |
+
specify
|
511 |
+
specifying
|
512 |
+
still
|
513 |
+
stop
|
514 |
+
strongly
|
515 |
+
sub
|
516 |
+
substantially
|
517 |
+
successfully
|
518 |
+
such
|
519 |
+
such that
|
520 |
+
sufficiently
|
521 |
+
suggest
|
522 |
+
sup
|
523 |
+
sure
|
524 |
+
surrounding
|
525 |
+
system
|
526 |
+
thereby
|
527 |
+
thereof
|
528 |
+
used for
|
529 |
+
utilize
|
530 |
+
whereby
|
531 |
+
wherein
|
532 |
+
u.s. pat
|
533 |
+
pat
|
534 |
+
patent
|
535 |
+
present invention
|
536 |
+
deg
|
537 |
+
bottom thereof
|
538 |
+
certain embodiments
|
539 |
+
field
|
540 |
+
a
|
541 |
+
b
|
542 |
+
c
|
543 |
+
d
|
544 |
+
e
|
545 |
+
f
|
546 |
+
g
|
547 |
+
h
|
548 |
+
i
|
549 |
+
j
|
550 |
+
k
|
551 |
+
l
|
552 |
+
m
|
553 |
+
n
|
554 |
+
o
|
555 |
+
p
|
556 |
+
q
|
557 |
+
r
|
558 |
+
s
|
559 |
+
t
|
560 |
+
u
|
561 |
+
v
|
562 |
+
w
|
563 |
+
x
|
564 |
+
y
|
565 |
+
z
|
566 |
+
]
|
567 |
+
[
|
App/assets/trainingsNegative
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
extremely low:problem
|
2 |
+
creates undesirable:problem
|
3 |
+
their lifetime is also limited to a few months:problem
|
4 |
+
cooking creates undesirable by-products as smoke and odor that can pollute an inhabited airspace:problem
|
5 |
+
it will be understood that some leakage may occur:problem
|
6 |
+
becomes heavy:problem
|
7 |
+
are not likely:problem
|
8 |
+
is designed:problem
|
9 |
+
could cause:problem
|
10 |
+
may impair:problem
|
11 |
+
do not:problem
|
12 |
+
signs of wear:problem
|
13 |
+
sign of wear:problem
|
14 |
+
wear:problem
|
15 |
+
blemish:problem
|
16 |
+
break:problem
|
17 |
+
bug:problem
|
18 |
+
complicate:problem
|
19 |
+
crack:problem
|
20 |
+
damage:problem
|
21 |
+
deflect:problem
|
22 |
+
deform:problem
|
23 |
+
usually face difficulties:problem
|
24 |
+
degrade:problem
|
25 |
+
the smoke particles are permanently trapped within the filter element because they are larger than the openings in the filter media:problem
|
26 |
+
deprive:problem
|
27 |
+
destroy:problem
|
28 |
+
deteriorate:problem
|
29 |
+
the disadvantage of a polarizing filter is that it withdraws observation the nondesired reflections but also of relevant information relating to the surface quality of the skin:problem
|
30 |
+
disparate:problem
|
31 |
+
fail:problem
|
32 |
+
hamper:problem
|
33 |
+
harm:problem
|
34 |
+
hinder:problem
|
35 |
+
impair:problem
|
36 |
+
smash:problem
|
37 |
+
spoil:problem
|
38 |
+
stain:problem
|
39 |
+
trouble:problem
|
40 |
+
weaken:problem
|
41 |
+
worsen:problem
|
42 |
+
break:problem
|
43 |
+
blemish:problem
|
44 |
+
bug:problem
|
45 |
+
cause:problem
|
46 |
+
complication:problem
|
47 |
+
crack:problem
|
48 |
+
damage:problem
|
49 |
+
defect:problem
|
50 |
+
deficiency:problem
|
51 |
+
deformity:problem
|
52 |
+
degradation:problem
|
53 |
+
deprivation:problem
|
54 |
+
destruction:problem
|
55 |
+
deterioration:problem
|
56 |
+
detriment:problem
|
57 |
+
difficulty:problem
|
58 |
+
disadvantage:problem
|
59 |
+
disadvantages:problem
|
60 |
+
a consultation of important duration presents disadvantages for the patient like for the user:problem
|
61 |
+
drawback:problem
|
62 |
+
failure:problem
|
63 |
+
flaw:problem
|
64 |
+
hamper:problem
|
65 |
+
harm:problem
|
66 |
+
impairing:problem
|
67 |
+
imperfection:problem
|
68 |
+
instability:problem
|
69 |
+
limitation:problem
|
70 |
+
prejudice:problem
|
71 |
+
problem:problem
|
72 |
+
spoiling:problem
|
73 |
+
stain:problem
|
74 |
+
trouble:problem
|
75 |
+
weakness:problem
|
76 |
+
difficut:problem
|
77 |
+
worse:problem
|
78 |
+
abnormal:problem
|
79 |
+
abolish:problem
|
80 |
+
abominable:problem
|
81 |
+
abominably:problem
|
82 |
+
abominate:problem
|
83 |
+
abomination:problem
|
84 |
+
abort:problem
|
85 |
+
aborted:problem
|
86 |
+
aborts:problem
|
87 |
+
abrade:problem
|
88 |
+
abrasive:problem
|
89 |
+
abrupt:problem
|
90 |
+
abruptly:problem
|
91 |
+
abscond:problem
|
92 |
+
absence:problem
|
93 |
+
absent-minded:problem
|
94 |
+
absentee:problem
|
95 |
+
absurd:problem
|
96 |
+
absurdity:problem
|
97 |
+
absurdly:problem
|
98 |
+
absurdness:problem
|
99 |
+
abuse:problem
|
100 |
+
abused:problem
|
101 |
+
abuses:problem
|
102 |
+
abusive:problem
|
103 |
+
abysmal:problem
|
104 |
+
abysmally:problem
|
105 |
+
abyss:problem
|
106 |
+
accidental:problem
|
107 |
+
accost:problem
|
108 |
+
accursed:problem
|
109 |
+
accusation:problem
|
110 |
+
accusations:problem
|
111 |
+
accuse:problem
|
112 |
+
accuses:problem
|
113 |
+
accusing:problem
|
114 |
+
accusingly:problem
|
115 |
+
acerbate:problem
|
116 |
+
acerbic:problem
|
117 |
+
acerbically:problem
|
118 |
+
ache:problem
|
119 |
+
ached:problem
|
120 |
+
aches:problem
|
121 |
+
achey:problem
|
122 |
+
aching:problem
|
123 |
+
acrid:problem
|
124 |
+
acridly:problem
|
125 |
+
acridness:problem
|
126 |
+
acrimonious:problem
|
127 |
+
acrimoniously:problem
|
128 |
+
acrimony:problem
|
129 |
+
adamant:problem
|
130 |
+
adamantly:problem
|
131 |
+
addict:problem
|
132 |
+
addicted:problem
|
133 |
+
addicting:problem
|
134 |
+
addicts:problem
|
135 |
+
admonish:problem
|
136 |
+
admonisher:problem
|
137 |
+
admonishingly:problem
|
138 |
+
admonishment:problem
|
139 |
+
admonition:problem
|
140 |
+
adulterate:problem
|
141 |
+
adulterated:problem
|
142 |
+
adulteration:problem
|
143 |
+
adulterier:problem
|
144 |
+
adversarial:problem
|
145 |
+
adversary:problem
|
146 |
+
adverse:problem
|
147 |
+
adversity:problem
|
148 |
+
afflict:problem
|
149 |
+
affliction:problem
|
150 |
+
afflictive:problem
|
151 |
+
affront:problem
|
152 |
+
afraid:problem
|
153 |
+
aggravate:problem
|
154 |
+
aggravating:problem
|
155 |
+
aggravation:problem
|
156 |
+
aggression:problem
|
157 |
+
aggressive:problem
|
158 |
+
aggressiveness:problem
|
159 |
+
aggressor:problem
|
160 |
+
aggrieve:problem
|
161 |
+
aggrieved:problem
|
162 |
+
aggrivation:problem
|
163 |
+
aghast:problem
|
164 |
+
agonies:problem
|
165 |
+
agonize:problem
|
166 |
+
agonizing:problem
|
167 |
+
agonizingly:problem
|
168 |
+
agony:problem
|
169 |
+
aground:problem
|
170 |
+
ail:problem
|
171 |
+
ailing:problem
|
172 |
+
ailment:problem
|
173 |
+
aimless:problem
|
174 |
+
alarm:problem
|
175 |
+
alarmed:problem
|
176 |
+
alarming:problem
|
177 |
+
alarmingly:problem
|
178 |
+
alienate:problem
|
179 |
+
alienated:problem
|
180 |
+
alienation:problem
|
181 |
+
allegation:problem
|
182 |
+
allegations:problem
|
183 |
+
allege:problem
|
184 |
+
allergic:problem
|
185 |
+
allergies:problem
|
186 |
+
allergy:problem
|
187 |
+
aloof:problem
|
188 |
+
altercation:problem
|
189 |
+
ambiguity:problem
|
190 |
+
ambiguous:problem
|
191 |
+
ambivalence:problem
|
192 |
+
ambivalent:problem
|
193 |
+
ambush:problem
|
194 |
+
amiss:problem
|
195 |
+
amputate:problem
|
196 |
+
anarchism:problem
|
197 |
+
anarchist:problem
|
198 |
+
anarchistic:problem
|
199 |
+
kokai publication no. 2000-331699 is not provided with a leaking liquid sensor:problem
|
200 |
+
anarchy:problem
|
201 |
+
anemic:problem
|
202 |
+
are not compatible:problem
|
203 |
+
an aqueous solution of methanol or water leaks from the dmfc unit:problem
|
204 |
+
many commercially available flow-rate sensors are generally considered to be incompatible with existing liquid-cooling systems suitable for computer systems:problem
|
205 |
+
leakage may occur during mating and demating:problem
|
206 |
+
a maximum number of passengers is thus attained for such a configuration of 550 passengers (with five doors):problem
|
207 |
+
it is however not ruled out to provide a stairway or an elevator in the aircraft:problem
|
208 |
+
many of these requirements are imposed by law or regulation:problem
|
209 |
+
such doors are known in the aircraft industry:problem
|
210 |
+
that space is lost with the curved ramps proposed by ep2460727:problem
|
211 |
+
the additional equipment required for multiple use needs to be stowed away in a space-saving way in the aircraft during the time it is not required:problem
|
212 |
+
the first and second supply areas may coincide or may partly overlap:problem
|
213 |
+
the ud-ccrc is one example of a crc:problem
|
214 |
+
therefore a flexibility of the light projection is considerably increased:problem
|
215 |
+
thus the possible scope of use of the module is significantly enhanced:problem
|
216 |
+
some of the sealed perforations may pop so that a hole exists in the continuous sheet of polymer:problem
|
App/assets/trainingsPositive
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ameliorate:partialSolution
|
2 |
+
detect:partialSolution
|
3 |
+
enhance:partialSolution
|
4 |
+
ameliorate:partialSolution
|
5 |
+
detect:partialSolution
|
6 |
+
enhance:partialSolution
|
7 |
+
unit 102 comprises two beds and an upper stowage compartment:partialSolution
|
8 |
+
provides:partialSolution
|
9 |
+
are within the scope:partialSolution
|
10 |
+
facilitate:partialSolution
|
11 |
+
improve:partialSolution
|
12 |
+
maintain:partialSolution
|
13 |
+
measure:partialSolution
|
14 |
+
preserve:partialSolution
|
15 |
+
save:partialSolution
|
16 |
+
stabilize:partialSolution
|
17 |
+
including:partialSolution
|
18 |
+
includes:partialSolution
|
19 |
+
included:partialSolution
|
20 |
+
better:partialSolution
|
21 |
+
allow:partialSolution
|
22 |
+
detect:partialSolution
|
23 |
+
amelioration:partialSolution
|
24 |
+
ensure:partialSolution
|
25 |
+
ensures:partialSolution
|
26 |
+
enhancement:partialSolution
|
27 |
+
improvement:partialSolution
|
28 |
+
detection:partialSolution
|
29 |
+
improvement:partialSolution
|
30 |
+
maintenance:partialSolution
|
31 |
+
at least:partialSolution
|
32 |
+
are:partialSolution
|
33 |
+
solution:partialSolution
|
34 |
+
include:partialSolution
|
35 |
+
includes:partialSolution
|
36 |
+
small construction:partialSolution
|
37 |
+
low power consumption:partialSolution
|
38 |
+
advantageously:partialSolution
|
39 |
+
is adapted:partialSolution
|
40 |
+
provided:partialSolution
|
41 |
+
can be:partialSolution
|
42 |
+
beneficiate:partialSolution
|
43 |
+
abound:partialSolution
|
44 |
+
abounds:partialSolution
|
45 |
+
abundance:partialSolution
|
46 |
+
abundant:partialSolution
|
47 |
+
accessable:partialSolution
|
48 |
+
accessible:partialSolution
|
49 |
+
acclaim:partialSolution
|
50 |
+
acclaimed:partialSolution
|
51 |
+
acclamation:partialSolution
|
52 |
+
accolade:partialSolution
|
53 |
+
accolades:partialSolution
|
54 |
+
accommodative:partialSolution
|
55 |
+
accomodative:partialSolution
|
56 |
+
accomplish:partialSolution
|
57 |
+
accomplished:partialSolution
|
58 |
+
accomplishment:partialSolution
|
59 |
+
accomplishments:partialSolution
|
60 |
+
accurate:partialSolution
|
61 |
+
accurately:partialSolution
|
62 |
+
achievable:partialSolution
|
63 |
+
achievement:partialSolution
|
64 |
+
achievements:partialSolution
|
65 |
+
achievible:partialSolution
|
66 |
+
acumen:partialSolution
|
67 |
+
adaptable:partialSolution
|
68 |
+
adaptive:partialSolution
|
69 |
+
adequate:partialSolution
|
70 |
+
adjustable:partialSolution
|
71 |
+
admirable:partialSolution
|
72 |
+
admirably:partialSolution
|
73 |
+
admiration:partialSolution
|
74 |
+
admire:partialSolution
|
75 |
+
admirer:partialSolution
|
76 |
+
admiring:partialSolution
|
77 |
+
admiringly:partialSolution
|
78 |
+
adorable:partialSolution
|
79 |
+
adore:partialSolution
|
80 |
+
adored:partialSolution
|
81 |
+
adorer:partialSolution
|
82 |
+
adoring:partialSolution
|
83 |
+
adoringly:partialSolution
|
84 |
+
adroit:partialSolution
|
85 |
+
adroitly:partialSolution
|
86 |
+
adulate:partialSolution
|
87 |
+
adulation:partialSolution
|
88 |
+
adulatory:partialSolution
|
89 |
+
advanced:partialSolution
|
90 |
+
advantage:partialSolution
|
91 |
+
advantageous:partialSolution
|
92 |
+
advantageously:partialSolution
|
93 |
+
advantages:partialSolution
|
94 |
+
adventuresome:partialSolution
|
95 |
+
adventurous:partialSolution
|
96 |
+
advocate:partialSolution
|
97 |
+
advocated:partialSolution
|
98 |
+
advocates:partialSolution
|
99 |
+
affability:partialSolution
|
100 |
+
affable:partialSolution
|
101 |
+
affably:partialSolution
|
102 |
+
affectation:partialSolution
|
103 |
+
affection:partialSolution
|
104 |
+
affectionate:partialSolution
|
105 |
+
affinity:partialSolution
|
106 |
+
affirm:partialSolution
|
107 |
+
affirmation:partialSolution
|
108 |
+
affirmative:partialSolution
|
109 |
+
affluence:partialSolution
|
110 |
+
affluent:partialSolution
|
111 |
+
afford:partialSolution
|
112 |
+
affordable:partialSolution
|
113 |
+
affordably:partialSolution
|
114 |
+
afordable:partialSolution
|
115 |
+
agile:partialSolution
|
116 |
+
agilely:partialSolution
|
117 |
+
agility:partialSolution
|
118 |
+
agreeable:partialSolution
|
119 |
+
agreeableness:partialSolution
|
120 |
+
agreeably:partialSolution
|
121 |
+
all-around:partialSolution
|
122 |
+
alluring:partialSolution
|
123 |
+
alluringly:partialSolution
|
124 |
+
altruistic:partialSolution
|
125 |
+
altruistically:partialSolution
|
126 |
+
amaze:partialSolution
|
127 |
+
amazed:partialSolution
|
128 |
+
amazement:partialSolution
|
129 |
+
amazes:partialSolution
|
130 |
+
amazing:partialSolution
|
131 |
+
amazingly:partialSolution
|
132 |
+
ambitious:partialSolution
|
133 |
+
ambitiously:partialSolution
|
134 |
+
ameliorate:partialSolution
|
135 |
+
amenable:partialSolution
|
136 |
+
amenity:partialSolution
|
137 |
+
amiability:partialSolution
|
138 |
+
amiabily:partialSolution
|
139 |
+
amiable:partialSolution
|
140 |
+
amicability:partialSolution
|
141 |
+
amicable:partialSolution
|
142 |
+
amicably:partialSolution
|
143 |
+
amity:partialSolution
|
144 |
+
ample:partialSolution
|
145 |
+
amply:partialSolution
|
146 |
+
amuse:partialSolution
|
147 |
+
amusing:partialSolution
|
148 |
+
amusingly:partialSolution
|
149 |
+
angel:partialSolution
|
150 |
+
angelic:partialSolution
|
151 |
+
apotheosis:partialSolution
|
152 |
+
appeal:partialSolution
|
153 |
+
appealing:partialSolution
|
154 |
+
applaud:partialSolution
|
155 |
+
appreciable:partialSolution
|
156 |
+
appreciate:partialSolution
|
157 |
+
appreciated:partialSolution
|
158 |
+
appreciates:partialSolution
|
159 |
+
appreciative:partialSolution
|
160 |
+
appreciatively:partialSolution
|
161 |
+
appropriately:partialSolution
|
162 |
+
the accumulated grease and particulate matter within reservoir is then appropriately discarded:partialSolution
|
163 |
+
such materials are inexpensive and readily processed:partialSolution
|
164 |
+
more or fewer sensors per hose line or hose line segment may be deployed if desired:partialSolution
|
165 |
+
cable is saved in the supply channel due to the use of the flexible supply module:partialSolution
|
166 |
+
these carts 10 are arranged in three superposed rows of four carts 10:partialSolution
|
167 |
+
these supply modules are usually arranged above the group of seats beneath the overhead compartment:partialSolution
|
168 |
+
the cabin lighting is seated on the light halo:partialSolution
|
169 |
+
the continuous adaptable light projection is adapted as a projection of a holographic film:partialSolution
|
170 |
+
a corresponding design of a fastening element may make it possible in a simple manner to provide a combined rotational and translational movement:partialSolution
|
171 |
+
a crc that is also to be used by passengers should have a height clearance that makes it possible for people to comfortably stand up:partialSolution
|
172 |
+
a further possible supply medium is medical oxygen:partialSolution
|
173 |
+
a lager variety of surfaces and positions can be offered to the passenger and therefore the comfort and the flexibility of handling of functional units is increased by means of the operator surface:partialSolution
|
174 |
+
a motorized mechanism not described in detail here allows movement of platform 30 :partialSolution
|
175 |
+
a passenger service unit comprising a lighting device according to claim 1 :partialSolution
|
176 |
+
a passenger service unit comprising a lighting device according to one of claims 1 to 8:partialSolution
|
177 |
+
a small overall height may result in a weight of only a few kg:partialSolution
|
178 |
+
all the seats are disposed in groups 42 of two columns of seats:partialSolution
|
179 |
+
control surfaces may also be activated in a variable manner and made available to each individual passenger:partialSolution
|
180 |
+
crcs are separate rooms that are only available for use by members of the crew:partialSolution
|
181 |
+
each gasper 66 includes a vent opening through which air is controllably emitted:partialSolution
|
182 |
+
each of the supply modules 710 is connected to a data bus 720 of the cms:partialSolution
|
183 |
+
a supply unit comprises a lighting device:partialSolution
|
184 |
+
the operator surface comprises a continuous adaptable illumination pattern:partialSolution
|
185 |
+
the suitable body is formed by a passenger's hand:partialSolution
|
186 |
+
it may also be used to receive additional seats:partialSolution
|
187 |
+
massage is thus not only relaxing but also health promoting:partialSolution
|
188 |
+
one or two rows of seats may be supplied:partialSolution
|
189 |
+
seats may also be disposed at that location:partialSolution
|
190 |
+
second compartment 24 thus is arranged above first compartment 22 but at a distance from the latter:partialSolution
|
191 |
+
steps 901 and 904 are optional:partialSolution
|
192 |
+
synonyms for certain terms are provided:partialSolution
|
193 |
+
the above-mentioned electro-optical devices may be used for modulating the light:partialSolution
|
194 |
+
the centre of the room comprises a universal gaming table 3702:partialSolution
|
195 |
+
the external system comprises a cms:partialSolution
|
196 |
+
the light forming module 60 is adapted to the light source:partialSolution
|
197 |
+
the light source of the illumination unit may be a single white led which generates a very intense light with a low power consumption:partialSolution
|
198 |
+
the pisa has a small construction and a low power consumption:partialSolution
|
199 |
+
the psu pod assembly 77 can also include a panel stiffener 96 :partialSolution
|
200 |
+
the psu pods are positioned below the first height and the psu channel is positioned above the first height:partialSolution
|
201 |
+
the psu pods are positioned below the system components:partialSolution
|
202 |
+
the resulting connection is then rather of a visual nature:partialSolution
|
203 |
+
the resulting laser beam is combined with a mems mirror/scanner 4 for a dynamic light modulation and a reflection sensor 23 :partialSolution
|
204 |
+
the second zone of the first deck advantageously has a substantially planar floor:partialSolution
|
205 |
+
the space adjacent the ramps is not lost space:partialSolution
|
206 |
+
the stowage space 76 then advantageously continues over the trolleys or the toilets:partialSolution
|
207 |
+
the unit further comprises the front panel 1202 :partialSolution
|
208 |
+
these sites 62 may be equipped for securing in place a wheelchair:partialSolution
|
209 |
+
unit 102 comprises two beds and an upper stowage compartment:partialSolution
|
210 |
+
unit 104 comprises three beds :partialSolution
|
211 |
+
whole-body or partial-body massage are offered:partialSolution
|
212 |
+
the analysis element is powered by durable long duration battery:partialSolution
|
213 |
+
to ml of 2.5 % w/v pva in water in a ml glass vial is added ml of the polymer solution dropwise with stirring:partialSolution
|
App/assets/wordAfterNumber
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
\b[ mm ]\b
|
2 |
+
\bmm\b
|
3 |
+
\bnm\b
|
4 |
+
\bor\b
|
5 |
+
\bcm\b
|
6 |
+
\]
|
7 |
+
\[
|
8 |
+
<
|
9 |
+
>
|
10 |
+
%
|
11 |
+
\'
|
12 |
+
°
|
13 |
+
\brpm\b
|
14 |
+
\bdeg\b
|
15 |
+
\bdegree\b
|
16 |
+
\bdegrees\b
|
17 |
+
\bx\b
|
18 |
+
\bkPa\b
|
19 |
+
\bm\b
|
20 |
+
\bpounds\b
|
21 |
+
\bseconds\b
|
22 |
+
\bsecond\b
|
23 |
+
\bcc\b
|
24 |
+
\bcc/sec\b
|
25 |
+
\bpsi\b
|
26 |
+
\bmol/mol\b
|
27 |
+
\bm2\b
|
28 |
+
\bm/s2\b
|
29 |
+
\bm/m\b
|
30 |
+
\bhz\b
|
31 |
+
\bm\b
|
32 |
+
\bcm2\b
|
33 |
+
\br/min\b
|
34 |
+
\bm/m\b
|
35 |
+
\bg/mol\b
|
36 |
+
\bkg\b
|
37 |
+
\bkg/s\b
|
38 |
+
\bm2/s\b
|
39 |
+
\bpa\b
|
40 |
+
\bkg/m3\b
|
41 |
+
\bpa/pa\b
|
42 |
+
\bµm\b
|
43 |
+
\bk\b
|
44 |
+
\bcm\b
|
45 |
+
\b°c\b
|
46 |
+
\b°f\b
|
47 |
+
\b°\b
|
48 |
+
\bs\b
|
49 |
+
\bm3\b
|
50 |
+
\bm3/s\b
|
51 |
+
\bg/g\b
|
52 |
+
\bar\b
|
53 |
+
\bc\b
|
54 |
+
\bch2o\b
|
55 |
+
\bch3oh\b
|
56 |
+
\bch4\b
|
57 |
+
\bc2h4o\b
|
58 |
+
\bc2h5oh\b
|
59 |
+
\bc2h6\b
|
60 |
+
\bc3h7oh\b
|
61 |
+
\bc3h8\b
|
62 |
+
\bc4h10\b
|
63 |
+
\bc5h12\b
|
64 |
+
\bco\b
|
65 |
+
\bco2\b
|
66 |
+
\bh\b
|
67 |
+
\bh2\b
|
68 |
+
\bh2o\b
|
69 |
+
\bh2so4\b
|
70 |
+
\bhc\b
|
71 |
+
\bhe\b
|
72 |
+
\b85kr\b
|
73 |
+
\bn2\b
|
74 |
+
\bnh3\b
|
75 |
+
\bnmhc\b
|
76 |
+
\bnmhce\b
|
77 |
+
\bno\b
|
78 |
+
\bno2\b
|
79 |
+
\bnox\b
|
80 |
+
\bn2o\b
|
81 |
+
\bnmog\b
|
82 |
+
\bnonmhc\b
|
83 |
+
\bnothc\b
|
84 |
+
\bo2\b
|
85 |
+
\bohc\b
|
86 |
+
\bpm\b
|
87 |
+
\bs\b
|
88 |
+
\bsvoc\b
|
89 |
+
\bthc\b
|
90 |
+
\bthce\b
|
91 |
+
\bzro2\b
|
92 |
+
\bpercent\b
|
93 |
+
\bpercents\b
|
94 |
+
\.
|
95 |
+
\bto\b
|
App/assets/wordBeforeNumber
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
\bclaim\b
|
2 |
+
\bclaims\b
|
3 |
+
\[
|
4 |
+
\bmm\b
|
5 |
+
\bnm\b
|
6 |
+
\bto\b
|
7 |
+
<
|
8 |
+
>
|
9 |
+
%
|
10 |
+
°
|
11 |
+
\ba\b
|
12 |
+
\.
|
13 |
+
\bkPa\b
|
14 |
+
\bof\b
|
15 |
+
us
|
16 |
+
\bx\b
|
17 |
+
\bapproximately\b
|
18 |
+
\bthe\b
|
19 |
+
\bbeetween\b
|
20 |
+
\brpm\b
|
App/assets/wordtagVerb
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
VBZ
|
2 |
+
VBG
|
App/bin/ClassifierWithIncr.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
basic_sentiment_analysis
|
4 |
+
~~~~~~~~~~~~~~~~~~~~~~~~
|
5 |
+
|
6 |
+
This module contains the code and examples described in
|
7 |
+
http://fjavieralba.com/basic-sentiment-analysis-with-python.html
|
8 |
+
|
9 |
+
"""
|
10 |
+
|
11 |
+
from pprint import pprint
|
12 |
+
import nltk
|
13 |
+
import yaml
|
14 |
+
import sys
|
15 |
+
import os
|
16 |
+
import re
|
17 |
+
from App.bin.constants import ASSETS
|
18 |
+
|
19 |
+
|
20 |
+
class Splitter(object):
    """Split raw text into tokenized sentences using NLTK's Punkt model."""

    def __init__(self):
        # Punkt handles sentence boundaries (incl. abbreviations);
        # the Treebank tokenizer handles word-level tokens.
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """Return the sentences of *text* as lists of word tokens.

        input format: a paragraph of text
        output format: a list of lists of words,
        e.g. [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        return [self.nltk_tokenizer.tokenize(raw_sentence)
                for raw_sentence in self.nltk_splitter.tokenize(text)]
|
34 |
+
|
35 |
+
|
36 |
+
class POSTagger(object):
    """Wrap NLTK part-of-speech tagging in the (form, lemma, [tags]) token format."""

    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """Tag every tokenized sentence with part-of-speech labels.

        input format: list of lists of words,
        e.g. [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of (form, lemma, [tag]) triples,
        e.g. [[('this', 'this', ['DT']), ('is', 'be', ['VB']), ...], ...]
        (the surface form is reused as a stand-in lemma)
        """
        tagged_sentences = []
        for sentence in sentences:
            triples = [(token, token, [pos_label])
                       for token, pos_label in nltk.pos_tag(sentence)]
            tagged_sentences.append(triples)
        return tagged_sentences
|
54 |
+
|
55 |
+
|
56 |
+
class DictionaryTagger(object):
    """Tag tokens against sentiment dictionaries loaded from YAML files.

    Each YAML file maps an expression (possibly multi-word) to a list of
    tags (e.g. 'positive', 'inc').  When the same key appears in several
    dictionaries, later tag lists extend the earlier ones.
    """

    def __init__(self, dictionary_paths):
        """Load and merge every YAML dictionary in *dictionary_paths*.

        Bug fix: the original closed the handles with
        ``map(lambda x: x.close(), files)``, which is a no-op in Python 3
        because ``map`` is lazy -- every file handle leaked.  Each file is
        now opened and closed with a ``with`` block.
        """
        self.dictionary = {}
        # length (in characters) of the longest key seen; bounds the
        # multi-word lookahead window used by tag_sentence()
        self.max_key_size = 0
        for path in dictionary_paths:
            with open(path, 'r') as dict_file:
                curr_dict = yaml.safe_load(dict_file)
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        """Apply tag_sentence() to every POS-tagged sentence."""
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """Tag one sentence of (form, lemma, [tags]) triples.

        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while (i < N):
            j = min(i + self.max_key_size, N)  # avoid overflow
            tagged = False
            while (j > i):
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    # self.logger.debug("found: %s" % literal)
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = [tag for tag in self.dictionary[literal]]
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:  # if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence
|
114 |
+
|
115 |
+
class ClassifyWithIncr_it(object):
    """Rule-based polarity classifier with incrementer/decrementer/inverter handling."""

    def __init__(self):
        print("printing")

    def value_of(self, sentiment):
        """Map a tag to its numeric polarity: 'positive' -> 1, 'negative' -> -1, other -> 0."""
        if sentiment == 'positive':
            return 1
        if sentiment == 'negative':
            return -1
        return 0

    def sentence_score(self, sentence_tokens, previous_token, acum_score):
        """Score one tagged sentence of (form, lemma, [tags]) triples.

        A token's score is the sum of its tag polarities; the *previous*
        token may double it ('inc'), halve it ('dec') or negate it ('inv').

        Bug fix: the original recursed once per token and could exceed
        Python's recursion limit on long sentences; this is the equivalent
        iterative loop (same parameters, same result).
        """
        score = acum_score
        for current_token in sentence_tokens:
            tags = current_token[2]
            token_score = sum(self.value_of(tag) for tag in tags)
            if previous_token is not None:
                previous_tags = previous_token[2]
                if 'inc' in previous_tags:
                    token_score *= 2.0
                elif 'dec' in previous_tags:
                    token_score /= 2.0
                elif 'inv' in previous_tags:
                    token_score *= -1.0
            score += token_score
            previous_token = current_token
        return score

    def sentiment_score(self, review):
        """Sum of sentence scores over all sentences of *review*."""
        return sum(self.sentence_score(sentence, None, 0.0) for sentence in review)

    def main(self, sentence):
        """Classify *sentence* and return 'problem', 'partialSolution' or 'neutre'."""
        splitter = Splitter()
        postagger = POSTagger()
        # sentiment and modifier dictionaries shipped with the application
        pos = ASSETS + "dicts/positive.yml"
        neg = ASSETS + "dicts/negative.yml"
        inc = ASSETS + "dicts/inc.yml"
        dec = ASSETS + "dicts/dec.yml"
        inv = ASSETS + "dicts/inv.yml"
        dicttagger = DictionaryTagger([pos, neg, inc, dec, inv])

        splitted_sentences = splitter.split(sentence)
        pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
        dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)

        print("Classification...")
        result = self.sentiment_score(dict_tagged_sentences)
        print(result)
        if result < 0:
            polarity = "problem"
        elif result > 0:
            polarity = "partialSolution"
        else:
            polarity = "neutre"
        return polarity
|
182 |
+
|
183 |
+
if __name__ == '__main__':
    # quick manual smoke test of the classifier
    sample_text = """this/these can be annoying"""
    classifier = ClassifyWithIncr_it()
    print(classifier.main(sample_text))
|
187 |
+
|
188 |
+
|
189 |
+
|
App/bin/ComplexParser.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""
|
3 |
+
Created on Mon Nov 28 16:02:26 2016
|
4 |
+
|
5 |
+
@author: Achille Souili
|
6 |
+
"""
|
7 |
+
import re
|
8 |
+
import nltk
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
class ComplexParser(object):
    """Extract clause-like chunks from a sentence with a POS-based chunk grammar."""

    def __init__(self, sentence):
        # raw sentence to analyse
        self.sentence = sentence

    def extract_parameters(self):
        """Return the CLAUSES chunks of the sentence joined into one string.

        Bug fix: the chunks were joined with the literal letter "d"
        (``"d".join(...)``), corrupting the result whenever more than one
        clause matched; they are now joined with a space.
        """
        sentence = self.sentence
        concept = []

        words = nltk.word_tokenize(sentence)
        tagged_sentence = nltk.pos_tag(words)
        # one clause: optional determiner/adjective, a <DT><NN> core, then the rest
        grammar = """CLAUSES: {<DT>?<JJ.*>?<DT><NN><.*>?<VB.*>?<.*>+}
                  """
        parameter_parser = nltk.RegexpParser(grammar)
        tree = parameter_parser.parse(tagged_sentence)
        for subtree in tree.subtrees():
            if subtree.label() == 'CLAUSES':
                parameter_candidate = " ".join(word for word, tag in subtree.leaves())
                concept.append(parameter_candidate)
        return " ".join(concept)
|
35 |
+
|
36 |
+
if __name__ == "__main__":
    # manual smoke test: show the POS tagging, then the extracted clauses
    sample_paragraph = "in which the surface of diffusion (24) is concave."
    tokens = nltk.word_tokenize(sample_paragraph)
    print(nltk.pos_tag(tokens))
    extractor = ComplexParser(sample_paragraph)
    print(extractor.extract_parameters())
|
App/bin/CorpusProcessor.py
ADDED
@@ -0,0 +1,460 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import numpy as np
|
10 |
+
import Levenshtein
|
11 |
+
from io import StringIO
|
12 |
+
from App.bin import constants
|
13 |
+
import hashlib
|
14 |
+
from collections import OrderedDict
|
15 |
+
from App.bin.InformationExtractor import InformationExtractor
|
16 |
+
from App.bin.ParameterExtractor import ParameterExtractor
|
17 |
+
from App.bin.TechnologyFinder import TechnologyFinder
|
18 |
+
from App.bin.InformationExtractor_Claims import InformationExtractorClaims
|
19 |
+
|
20 |
+
class CorpusProcessor(object):
    """Drive parameter and problem/partial-solution concept extraction over a
    corpus of patents and assemble the results into a JSON "problem graph".

    NOTE(review): reconstructed from a garbled diff rendering — some inner
    indentation (loop nesting) was ambiguous and was restored to the most
    plausible reading; confirm against the repository.
    """

    def __init__(self, patents, input_folder, file_extension):
        # `patents` is an iterable of patent records (dicts or JSON strings).
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")

    def make_graphic(self, sizes, text, colors, labels):
        """Render a donut chart of `sizes` with `text` centred in the hole.

        `colors` are 0-255 RGB triples; they are rescaled to matplotlib's
        0-1 floats. Blocks on plt.show().
        """
        col = [[i / 255. for i in c] for c in colors]

        fig, ax = plt.subplots()
        ax.axis('equal')
        width = 0.35
        kwargs = dict(colors=col, startangle=180)
        outside, _ = ax.pie(sizes, radius=1, pctdistance=1 - width / 2, labels=labels, **kwargs)
        # Shrink the wedges into a ring (donut) shape.
        plt.setp(outside, width=width, edgecolor='white')

        kwargs = dict(size=20, fontweight='bold', va='center')
        ax.text(0, 0, text, ha='center', **kwargs)

        plt.show()

    def change_keys(self, dictionnary, number):
        """Recursively prefix every key of `dictionnary` with "<number>-".

        Non-dict values are returned unchanged (recursion leaves).
        """
        number = number+'-'
        if type(dictionnary) is dict:
            return dict([(number+str(k) , self.change_keys(v, number)) for k, v in dictionnary.items()])
        else:
            return dictionnary

    def process_corpus(self):
        """Run the full pipeline over JSON patent records.

        For each patent: harvest parameters line by line, deduplicate them,
        extract concepts from description and claims, then glue everything
        into one JSON document, write it to <graph_folder>/graph.json and
        return it as a string.
        """
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder+"/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        # NOTE(review): parameters_list is never reset inside the patent
        # loop, so parameters accumulate across the whole corpus — confirm
        # this is intentional.
        parameters_list = []
        technologies_graph = []

        for patent_file in patents:
            output_json_claims = {}
            total_sentences_number_claims = 0

            # Accept both dicts and already-serialised JSON strings.
            if type(patent_file) is dict:
                patent_file = json.dumps(patent_file)

            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']

            # Espacenet URL templates (first-page clipping image and PDF).
            root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
            root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

            if nNumber is not None:
                # Split the publication number into country code / serial /
                # kind code, e.g. "EP1234567A1" -> EP, 1234567, A1.
                # NOTE(review): `match` is used unguarded — a number that
                # does not fit the pattern raises AttributeError, and when
                # nNumber is None urlImg/urlPDF stay unbound yet are used
                # below when building parameters_array.
                match = re.search('(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', nNumber)
                # CC for country code
                CC = match.group(1)
                # NR for Number
                NR = match.group(2)
                NR = re.sub(r'\s', '', NR)
                # KC for Kind code
                KC = match.group(4)

                urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
                urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'

            #Find a more elegant way to do it
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()
            # for line in patent_content:
            #     line = self.dataCleaner(line)
            #     reduced_content.append(line)

            # Harvest candidate parameters from every line of the patent.
            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend( parameters)
            # Drop single-word parameters that appear inside longer ones.
            # NOTE(review): removing from a list while iterating over it
            # skips elements — classic mutation-during-iteration hazard.
            for i in parameters_list:
                for j in parameters_list:
                    if i != j and len(i.split()) == 1:
                        if j.find(i) > -1 and i in parameters_list:
                            parameters_list.remove(i)

            parameters_list = list(set(parameters_list))
            # For long lists, also collapse near-duplicates (Levenshtein
            # similarity >= 0.4), keeping the shorter variant.
            if len(parameters_list) > 50:
                for i in parameters_list:
                    for j in parameters_list:
                        if i != j:
                            comp = Levenshtein.ratio(i, j)
                            if comp >= .4 and i in parameters_list and j in parameters_list:
                                if len(i) > len(j):
                                    # print('{} is near duplicate of {}'.format(i, j))
                                    parameters_list.remove(i)

            # Finally discard any remaining single-word parameters.
            for el in parameters_list:
                if len(el.split()) == 1:
                    parameters_list.remove(el)

            # Key the surviving parameters as "<patent-number>-<index>".
            parameters = dict(enumerate(parameters_list, 1))

            parameters = self.change_keys(parameters, nNumber.lower())

            source = input_folder+"/"+nNumber+file_extension.strip("*")

            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters,
                    "image": urlImg,
                    "pdf": urlPDF
                }

            })
            # NOTE(review): sort_keys expects a bool; passing the
            # OrderedDict class is merely truthy (keys get sorted).
            pParameters = json.dumps(parameters_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

            parameters_graph.append(pParameters)

            # Concept extraction: description and claims feed the same graph.
            if dDescription != "" or cClaims != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber )
                output_json, total_sentences_number = extract_concepts.get_from_description()
                extract_concepts_claims = InformationExtractorClaims(cClaims, input_folder, file_extension, nNumber )
                output_json_claims_result = extract_concepts_claims.main()
                if output_json_claims_result is not None:
                    output_json_claims, total_sentences_number_claims = output_json_claims_result

                count_claims += 1
                if output_json is not None:
                    if type(output_json) is dict:
                        output_json = json.dumps(output_json)
                    extracted_concepts.append(output_json)
                    # NOTE(review): this doubles the running total (x += x)
                    # instead of adding the description sentence count.
                    total_sentences_number += total_sentences_number
                if output_json_claims is not None :
                    if type(output_json_claims) is dict:
                        output_json_claims = json.dumps(output_json_claims)
                    extracted_concepts.append(output_json_claims)
                    total_sentences_number += total_sentences_number_claims
            elif cClaims != "":
                count_claims += 1
                print('Processing claims')
            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1

            #print(source)
            source_list.append(source)
            patent_corpus.append(reduced_content)
        patent_corpus = dict(zip(source_list, patent_corpus))
        '''
        get_patent_technologies = TechnologyFinder(patent_corpus)
        technologies = get_patent_technologies.get_technologies()


        for source_file, technologies_list in technologies.items():

            technologies_array = OrderedDict({
                "concept": {
                    "source": source_file,
                    "values": technologies_list
                }

            })
            tTechnologies = json.dumps(technologies_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

            technologies_graph.append(tTechnologies)
        '''
        print(type(extracted_concepts))
        # Assemble the final JSON document by string concatenation, then
        # repair stray commas before parsing it back into a dict.
        header = '{'
        graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        #technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
        footer = '}'
        #output_result.extend((header, graph, parameters_output,technologies_output, footer ))
        output_result.extend((header, graph, parameters_output, footer))

        output_result = "".join(output_result)
        output_result = re.sub(r'\,{2,}', ',', output_result)
        output_result = re.sub(r'\}\,\]', '}]', output_result)

        # exit()
        # print(output_result)
        concepts_json = json.loads(output_result)

        # concepts_json = json.loads(concepts_json)

        # Tally concept polarities for the summary printed below.
        count_concepts = len(concepts_json['problem_graph'])
        for item, value in concepts_json.items():
            #if cle == "type" and value =="partialSolution":
            #    print ("yes")
            for element in value:
                for cle, valeur in element.items():
                    for k, v in valeur.items():
                        if k == "type" and v == "partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v == "problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        #print(concepts_json.keys())

        # original code
        with open(graph_folder+"graph.json", 'w') as json_graph:

            # with open(graph_folder + 'graph.json', 'w') as json_graph:
            json_graph.write(json_write_to_file)
        number_neutre = count_concepts - count_concepts_problem - count_concepts_solupart
        # French summary: patents / abstracts / claims / descriptions seen.
        print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
        print("%s phrases ont été analysée(s)" % (total_sentences_number))
        print("%s concepts ont été trouvé(s) dont %s problèmes, %s solutions partielles et %s neutres" % (count_concepts, count_concepts_problem, count_concepts_solupart, number_neutre))

        #Display graphics
        first_color = (46, 204, 113)
        second_color = (245, 176, 65)
        #self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions'])
        return json_write_to_file

    def process_corpus_json(self):
        """Variant of process_corpus() for records that carry a 'filename'
        field; only the description (or, failing that, the claims) is mined.
        Writes <graph_folder>/graph.json and returns the JSON string.
        """
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        # NOTE(review): shared across all patents, never reset per patent.
        parameters_list = []
        technologies_graph = []
        for patent_file in patents:
            # print(type(patent_file))

            #if type(patent_file) is dict:
            patent_file = json.dumps(patent_file)

            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            # print(type(patent))
            filename = patent['filename']
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']

            # Find a more elegant way to do it
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()
            # for line in patent_content:
            #     line = self.dataCleaner(line)
            #     reduced_content.append(line)

            # Same parameter harvesting / deduplication as process_corpus().
            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend(parameters)
            for i in parameters_list:
                for j in parameters_list:
                    if i != j and len(i.split()) == 1:
                        if j.find(i) > -1 and i in parameters_list:
                            parameters_list.remove(i)

            parameters_list = list(set(parameters_list))

            if len(parameters_list) > 50:
                for i in parameters_list:
                    for j in parameters_list:
                        if i != j:
                            comp = Levenshtein.ratio(i, j)
                            if comp >= .4 and i in parameters_list and j in parameters_list:
                                if len(i) > len(j):
                                    # print('{} is near duplicate of {}'.format(i, j))
                                    parameters_list.remove(i)

            for el in parameters_list:
                if len(el.split()) == 1:
                    parameters_list.remove(el)

            print('{} {}'.format('Taille: ', len(parameters_list)))

            parameters = dict(enumerate(parameters_list, 1))

            parameters = self.change_keys(parameters, nNumber.lower())

            source = input_folder + "/" + nNumber + file_extension.strip("*")

            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters
                }

            })
            pParameters = json.dumps(parameters_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

            parameters_graph.append(pParameters)

            #if dDescription != "" and cClaims!="":
            if dDescription != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, filename)
                output_json, total_sentences_number_d = extract_concepts.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += total_sentences_number_d
                #count_claims += 1
                #extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
                #output_json, total_sentences_number_c = extract_concepts.get_from_claims()
                #if output_json != "":
                #extracted_concepts.append(output_json)
                #total_sentences_number_c += total_sentences_number_c
                #total_sentences_number = total_sentences_number_c+total_sentences_number_d

            elif cClaims != "":
                count_claims += 1
                extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
                output_json, total_sentences_number = extract_concepts.get_from_claims()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    # NOTE(review): doubles the total (x += x).
                    total_sentences_number += total_sentences_number
            # NOTE(review): unreachable — same condition as the first branch.
            elif dDescription != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
                output_json, total_sentences_number = extract_concepts.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += total_sentences_number
                count_claims += 1

            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1

            # print(source)
            # source_list.append(source)
            # patent_corpus.append(reduced_content)
        # patent_corpus = dict(zip(source_list, patent_corpus))
        '''
        get_patent_technologies = TechnologyFinder(patent_corpus)
        technologies = get_patent_technologies.get_technologies()


        for source_file, technologies_list in technologies.items():

            technologies_array = OrderedDict({
                "concept": {
                    "source": source_file,
                    "values": technologies_list
                }

            })
            tTechnologies = json.dumps(technologies_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

            technologies_graph.append(tTechnologies)
        '''

        # Assemble, repair commas, and parse the combined JSON document.
        header = '{'
        graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
        footer = '}'
        # output_result.extend((header, graph, parameters_output,technologies_output, footer ))
        output_result.extend((header, graph, parameters_output, footer))

        output_result = "".join(output_result)
        output_result = re.sub(r'\,{2,}', ',', output_result)
        output_result = re.sub(r'\}\,\]', '}]', output_result)
        concepts_json = json.loads(output_result)

        # Tally concept polarities for the summary printed below.
        count_concepts = len(concepts_json['problem_graph'])
        for item, value in concepts_json.items():
            # if cle == "type" and value =="partialSolution":
            #     print ("yes")
            for element in value:
                for cle, valeur in element.items():
                    for k, v in valeur.items():
                        if k == "type" and v == "partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v == "problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        # print(concepts_json.keys())
        with open(graph_folder + "graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)

        # French summary of what was processed.
        print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (
            count_patent, count_abstract, count_claims, count_description))
        print("%s phrases ont été analysée(s)" % (total_sentences_number))
        print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % (
            count_concepts, count_concepts_problem, count_concepts_solupart))

        # Display graphics
        first_color = (46, 204, 113)
        second_color = (245, 176, 65)
        # self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions'])
        return json_write_to_file
|
App/bin/FiguresCleaner.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import re
|
4 |
+
import nltk
|
5 |
+
import json
|
6 |
+
|
7 |
+
from App.bin import constants
|
8 |
+
|
9 |
+
|
10 |
+
class FiguresCleaner(object):
    """Strip figure/reference numbers out of a text section.

    The section is POS-tagged; tokens tagged CD (cardinal number) that are
    not flanked by whitelisted "before"/"after" context words (loaded from
    the assets files) are treated as figure references and dropped when
    they contain a digit.

    NOTE(review): reconstructed from a garbled diff — the pairing of the
    final ``else`` branch is ambiguous in the rendering; confirm against
    the repository before relying on the exact output shape.
    """

    def __init__(self, sections):
        # Raw text to clean; tokenised lazily in clean_figures().
        self.sections = sections

    def clean_figures(self):
        """Return a list of token fragments with figure numbers removed.

        Kept tokens carry a trailing space; all other positions contribute
        a "\\n" marker.
        """
        sections = self.sections
        clean_content = []
        # Words that may legitimately FOLLOW a number (units etc.).
        with open(constants.ASSETS + "wordAfterNumber", 'r') as l:
            after_words = l.read().splitlines()
            after_words_patterns = re.compile('|'.join(after_words))
        # Words that may legitimately PRECEDE a number.
        with open(constants.ASSETS + "wordBeforeNumber", 'r') as l:
            before_words = l.read().splitlines()
            before_words_patterns = re.compile('|'.join(before_words))

        #sections = sections.splitlines()
        words = nltk.word_tokenize(sections)
        tagged_words = nltk.pos_tag(words)
        # NOTE(review): the last token (i == len-1) is never examined.
        for i in range(len(tagged_words)):
            if i < len(tagged_words) - 1:
                next_word = tagged_words[i + 1][0]
                current_word = tagged_words[i][0]
                # NOTE(review): at i == 0 this wraps to tagged_words[-1]
                # (the LAST token) — probably unintended.
                previous_word = tagged_words[i - 1][0]
                currentWordTag = tagged_words[i][1]
                # `not re.match(...) is not None` parses as
                # `not (match is not None)`, i.e. "pattern did NOT match".
                if currentWordTag == 'CD' and not re.match(after_words_patterns,
                                                           next_word) is not None and not re.match(
                        before_words_patterns, previous_word) is not None:
                    if re.search(r'\d', current_word) is not None:
                        # Digit-bearing cardinal with no numeric context:
                        # treat as a figure reference and drop it.
                        continue
                    else:
                        clean_content.append(current_word + " ")
                else:
                    clean_content.append("\n")

        return clean_content
|
App/bin/FindTechnologies.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*
|
3 |
+
import sys
|
4 |
+
import os
|
5 |
+
import math
|
6 |
+
import xlsxwriter
|
7 |
+
from textblob import TextBlob as tb
|
8 |
+
|
9 |
+
class FindTechnologies(object):
    """Rank noun phrases per document by TF-IDF across a small corpus.

    The tf/idf helpers are pure functions of TextBlob-like objects (anything
    exposing a ``noun_phrases`` sequence); they were previously defined
    without ``self`` and called as bare names from get_technologies(), which
    raised NameError at runtime — they are now proper @staticmethods invoked
    through the class/instance.
    """

    def __init__(self):
        print("Starting")

    @staticmethod
    def tf(word, blob):
        """Term frequency: share of `blob`'s noun phrases equal to `word`.

        Raises ZeroDivisionError when the blob has no noun phrases (same as
        the original float-cast version).
        """
        return blob.noun_phrases.count(word) / len(blob.noun_phrases)

    @staticmethod
    def n_containing(word, bloblist):
        """Number of documents in `bloblist` whose noun phrases contain `word`."""
        return sum(1 for blob in bloblist if word in blob.noun_phrases)

    @staticmethod
    def idf(word, bloblist):
        """Smoothed inverse document frequency of `word` over `bloblist`."""
        return math.log(len(bloblist) / float(1 + FindTechnologies.n_containing(word, bloblist)))

    @staticmethod
    def tfidf(word, blob, bloblist):
        """TF-IDF score of `word` in `blob` relative to `bloblist`."""
        return FindTechnologies.tf(word, blob) * FindTechnologies.idf(word, bloblist)

    # Create an excel file for validation purpose

    def get_technologies(self,
                         folder_path="C:/Users/asouili01/Documents/PatSemBeta-v3/Data/input/Gaggenau/",
                         stopwords_path='C:/Users/asouili01/Documents/PIXSEB/Ressources/stopwords.txt'):
        """Print the top-5 TF-IDF noun phrases of every document under
        `folder_path`.

        Paths are now parameters (defaults keep the historical hard-coded
        locations, so existing callers are unaffected).
        """
        # Stopword list; the file previously stayed open for the process
        # lifetime and the filter iterated CHARACTERS of the content rather
        # than words — both fixed below.
        with open(stopwords_path, 'r') as stopwords_file:
            stopwords = stopwords_file.read().split('\r\n')
        bloblist = []

        filenamelist = []

        for path, dirs, files in os.walk(folder_path):
            for filename in files:
                print(filename)
                filenamelist.append(filename)
                # splitext is robust to filenames containing extra dots,
                # where `filename.split('.')` raised ValueError.
                name, extension = os.path.splitext(filename)
                filepath = folder_path + "/" + filename
                with open(filepath, "r", encoding="utf-8") as filehandler:
                    content = filehandler.read()
                # Word-level stopword filtering (was character-level).
                filteredtext = [t for t in content.split() if t.lower() not in stopwords]
                filteredcontent = ' '.join(filteredtext)
                blob = 'blob_' + name.lower()
                print(blob)
                blob = tb(filteredcontent.lower())
                bloblist.append(blob)

        print(bloblist)

        for i, blob in enumerate(bloblist):
            print("Top words in document {}".format(i + 1))
            scores = {word: self.tfidf(word, blob, bloblist) for word in blob.noun_phrases}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words[:5]:
                print("\tWord: {}, TF-IDF: {}".format(word, round(score, 10)))
|
64 |
+
|
App/bin/InformationExtractor.py
ADDED
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
|
4 |
+
import nltk
|
5 |
+
nltk.download('all')
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import json
|
9 |
+
import hashlib
|
10 |
+
import Levenshtein
|
11 |
+
import uuid
|
12 |
+
from App.bin import constants
|
13 |
+
from collections import OrderedDict
|
14 |
+
from nltk import word_tokenize
|
15 |
+
|
16 |
+
from App.bin.SharpClassifier import SharpClassifier
|
17 |
+
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
|
18 |
+
from App.bin.SentenceClassifier import SentenceClassifier
|
19 |
+
from App.bin.ParameterExtractor import ParameterExtractor
|
20 |
+
|
21 |
+
class InformationExtractor(object):
|
22 |
+
|
23 |
+
patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
|
24 |
+
sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
|
25 |
+
sentence_finder._params.abbrev_types.update(patent_abbreviations)
|
26 |
+
|
27 |
+
def __init__(self, section, input_folder,file_extension, file_name):
|
28 |
+
self.section = section
|
29 |
+
self.input_folder = input_folder
|
30 |
+
self.file_extension = file_extension
|
31 |
+
self.file_name = file_name
|
32 |
+
|
33 |
+
print("Extracting problem graph")
|
34 |
+
|
35 |
+
#@staticmethod
|
36 |
+
|
37 |
+
|
38 |
+
def discardLines(self, line,lexic):
|
39 |
+
with open (constants.ASSETS+ lexic) as m:
|
40 |
+
exclusion_list = m.read().splitlines()
|
41 |
+
if any(word in line for word in exclusion_list):
|
42 |
+
pass
|
43 |
+
else:
|
44 |
+
return line
|
45 |
+
|
46 |
+
|
47 |
+
def selectLines(self, line, lexic):
|
48 |
+
with open(constants.ASSETS + lexic) as n:
|
49 |
+
inclusion_list = n.read().splitlines()
|
50 |
+
if any(word in line for word in inclusion_list):
|
51 |
+
return line
|
52 |
+
|
53 |
+
def last_cleansing(self, concept):
|
54 |
+
concept = str(concept)
|
55 |
+
concept = concept.lower()
|
56 |
+
if concept.endswith("."):
|
57 |
+
concept = concept.strip(".")
|
58 |
+
concept = re.sub(r'^consequently ','', concept)
|
59 |
+
concept = re.sub(r'^such ', '', concept)
|
60 |
+
concept = re.sub(r'^said ', '', concept)
|
61 |
+
concept = re.sub(r'^\s+', '', concept)
|
62 |
+
concept = re.sub(r'^it is worth noting that ', '', concept)
|
63 |
+
concept = re.sub(r'^example of ', '', concept)
|
64 |
+
concept = re.sub(r'^since ', '', concept)
|
65 |
+
concept = re.sub(r'^\( |\)$ ', '', concept)
|
66 |
+
return concept
|
67 |
+
|
68 |
+
# def get_from_claims(self):
|
69 |
+
#
|
70 |
+
# section = self.section
|
71 |
+
# content = []
|
72 |
+
# sentence_finder = InformationExtractor.sentence_finder
|
73 |
+
# sentences = sentence_finder.tokenize(section.strip())
|
74 |
+
# with open(constants.ASSETS + "getFromClaims") as concept:
|
75 |
+
# # next(concept)
|
76 |
+
# included_words = concept.read().splitlines()
|
77 |
+
# include_link_pattern = re.compile('|'.join(included_words))
|
78 |
+
|
79 |
+
|
80 |
+
    def get_from_description(self):
        """Extract problem / partial-solution concepts from a patent description.

        Splits ``self.section`` into sentences with the class-level Punkt
        tokenizer, routes each sentence through several asset word-lists
        (exclusion/inclusion lists, exemplification clues, link words),
        classifies every surviving candidate with ``ClassifyWithIncr_it`` and
        serializes each concept as a pretty-printed JSON object string that
        also carries extracted parameters, Espacenet image/PDF URLs and a
        graph node id.

        Returns:
            tuple: (comma-joined JSON concept strings, total sentence count).
        """
        previous_polarity = ''
        noise_trash =[]

        # Accumulators for the different sentence categories built below.
        content = []
        include_links = []
        output_content = []
        ex_output_content = []
        output_result=[]
        output_linked_content = []
        output_inter_content = []
        uniq_output_linked_content =[]
        ex_output_content_linked =[]
        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder+"/"+file_name+file_extension.strip("*")

        # NOTE(review): the md5-based id is immediately overwritten by a
        # random UUID, so graph ids are not reproducible across runs.
        graphItemId = hashlib.md5(file_name.encode())
        graphItemIdValue = graphItemId.hexdigest()
        graphItemIdValue = str(uuid.uuid4())
        t_sline = ""
        t_sline_ex =[]
        compt_Id = 30       # id counter for linked-sentence concepts
        compt_Id_ex = 40    # id counter for exemplification concepts

        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        if file_name is not None:
            # Split a publication number such as "EP1234567A1" into country
            # code / number / kind code to build the Espacenet URLs.
            match = re.search('(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            # CC for country code
            CC = match.group(1)
            # NR for Number
            NR = match.group(2)
            NR = re.sub(r'\s', '', NR)
            # KC for Kind code
            KC = match.group(4)

            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'

        # Class-level Punkt tokenizer (already primed with patent abbreviations).
        sentence_finder = InformationExtractor.sentence_finder

        sentences = sentence_finder.tokenize(section.strip())

        # Word lists: link markers ("however", "if", ...) and exemplification clues.
        with open(constants.ASSETS + "includeLinks") as concept:
            included_words = concept.read().splitlines()
            include_link_pattern = re.compile('|'.join(included_words))
        with open(constants.ASSETS + "examplificationclues") as examplif:
            exam_words = examplif.read().splitlines()
            examplif_word_pattern = re.compile('|'.join(exam_words))

        description_sentences_number = len(sentences)
        number_of_words = 0
        # Pass 1: drop sentences matching the exclusion list, count words.
        for sentence in sentences:
            number_of_word = len(nltk.word_tokenize(sentence))
            number_of_words += number_of_word

            sentenced = self.discardLines(sentence, "exclusionList")

            if sentenced is not None:
                content.append(sentenced)
        total_sentences_number = len(sentences)

        # Pass 2: keep inclusion-list sentences; split them by comma count.
        for line in content:
            line = self.selectLines(line, "inclusionList")

            if line is not None:
                if re.match(include_link_pattern, line):
                    include_links.append(line)
                if line.count(',') == 0:
                    output_content.append(line)
                if line.count(',') > 0:
                    output_inter_content.append(line)
                    # NOTE(review): mutating `content` while iterating it —
                    # sentences following a removed one may be skipped.
                    content.remove(line)
        # Pass 3: pick out exemplification sentences from what remains.
        for s in content:
            sentence = self.discardLines(s, "FilterS")
            if sentence is not None:
                if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
                    s = str(s)
                    cs = s.lower()
                    cs = re.sub(examplif_word_pattern, '', cs)
                    cs = re.sub('which', 'this/these', cs)
                    cs = re.sub(r'\.$', '', cs)
                    if cs.count(',') == 1 and cs.count('such as')==0:
                        ex_output_content_linked.append(cs)
                    else:
                        ex_output_content.append(cs)
                elif s.count(',') == 1:
                    s = str(s)
                    s = s.lower()
                    s = self.selectLines(s, "OneCommaDiscriminator")
                    if s is not None:
                        # Strip leading discourse markers before classification.
                        s = re.sub(r'^thus, ', '', s)
                        s = re.sub(r'^preferably, ', '', s)
                        s = re.sub(r'^conventional ', '', s)
                        s = re.sub(r'^in particular, ', '', s)
                        s = re.sub(r'^specifically, ', '', s)
                        s = re.sub(r'^as necessary, ', '', s)
                        s = re.sub(', which', ',this/these', s)
                        s = re.sub(r'\.$', '', s)

                        if s.count(',')==1:
                            ex_output_content_linked.append(s)
                        else:
                            ex_output_content.append(s)
                else:
                    pass

        # Pass 3a: comma-split linked exemplification sentences into chained
        # parent/child concepts (first fragment = parent, rest = children).
        print(len(ex_output_content_linked))
        ex_output_content_linked = list(set(ex_output_content_linked))
        for line in ex_output_content_linked:
            line = line.lower()
            if 'figure' not in line:
                t_sline_ex = line.strip().split(',')
                for concept in t_sline_ex:
                    words = nltk.word_tokenize(concept)
                    tagged = nltk.pos_tag(words)
                    parameters_list = []
                    compteur = 0
                    compt_Id_ex += 1
                    # Require at least one verb-like/comparative tag, otherwise
                    # the fragment is not a usable statement.
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged if pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)

                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()

                    parameters_list.extend( parameters)

                    clean_concept = self.last_cleansing(concept)

                    validity = self.discardLines(concept, 'referencing_indices')
                    if t_sline_ex.index(concept) == 0 and validity is not None:
                        previous_polarity = polarite
                        # First fragment: parent node pointing to its child id.
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id_ex + 1),
                                "id": graphItemIdValue + str(compt_Id_ex),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters":parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }

                        })

                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity =='partialSolution' or validity is None:
                            continue
                        else:
                            compteur += 1
                            # Follow-up fragment: child node pointing back to
                            # its parent id.
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "parents": graphItemIdValue + str(compt_Id_ex - 1),
                                    "id": graphItemIdValue + str(compt_Id_ex),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF

                                }

                            })

                    json_string_linkes = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

                    output_result.append(json_string_linkes)

        # Pass 3b: stand-alone exemplification sentences (no chaining).
        ex_output_content = list(set(ex_output_content))
        for concept in ex_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged if
                    pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id_ex += 1
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)

            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()

            clean_concept = self.last_cleansing(concept)
            parameters_list.extend(parameters)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:

                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id_ex),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF

                    }

                })
                json_string = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        # Pass 4: strip discourse connectors ("however", "if", "when",
        # "since") from link-marked sentences.
        for line in include_links:
            line = line.lower()

            if re.match(r'however', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'however|,', '', line)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'however', line) and line.count(',') > 1:
                sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
                sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
                sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
                sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
                sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
                sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
                sline = re.sub(r'^however,?\s*', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'if', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
                sline = re.sub(r'^if,?\s*', '', sline)
                sline = re.sub(r'^if ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)

            if re.match(r'when', line):
                line = str(line)
                line = line.lower()
                sline = re.sub(r'^when\s*', '', line)
                sline = re.sub(r'^when,?\s*', '', sline)
                sline = re.sub(r'^when ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
                sline = re.sub(r'^since', '', line)
                sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)

        # Strip a leading "if " from the comma-free sentences as well.
        # NOTE(review): appends to output_content while iterating it.
        for line in output_content:
            line = line.lower()
            if re.match(r'if', line):
                line = str(line)
                sline = re.sub(r'^if ', '', line)
                if sline not in output_linked_content:
                    output_content.append(sline)

        # Pass 5: split cleaned linked sentences into parent/child concepts.
        uniq_output_linked_content = list(set(output_linked_content))
        for line in uniq_output_linked_content:
            line = line.lower()
            if 'figure' in line:
                # NOTE(review): removing from the list being iterated.
                uniq_output_linked_content.remove(line)
            sline = re.sub(r'^\s+', '', line)
            sline = re.sub(r'^\d+\.+$', '', sline)

            if sline.count(',') <= 1:
                t_sline = tuple(sline.strip().split(', '))
                for concept in t_sline:
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged if
                            pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    else:
                        parameters_list = []
                        compteur = 0
                        compt_Id += 1
                        # Second, broader verb check (any V* tag).
                        tagged = nltk.pos_tag(word_tokenize(concept))
                        tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']
                        if len(tags) < 1:
                            continue
                        classifyT = ClassifyWithIncr_it()
                        polarite = classifyT.main(concept)

                        get_parameters = ParameterExtractor(concept)
                        parameters = get_parameters.extract_parameters()

                        parameters_list.extend( parameters)

                        clean_concept = self.last_cleansing(concept)
                        validity = self.discardLines(concept, 'referencing_indices')

                        if t_sline.index(concept) == 0 and validity is not None:
                            previous_polarity = polarite
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "enfants": graphItemIdValue + str(compt_Id + 1),
                                    "id": graphItemIdValue + str(compt_Id),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters":parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }

                            })

                        else:
                            print("Previous polarity is : " + str(previous_polarity))
                            # NOTE(review): 'partialSolutiond' (trailing d)
                            # never matches any classifier label — presumably
                            # a typo for 'partialSolution'; confirm upstream.
                            if previous_polarity =='partialSolutiond' or validity is None:
                                continue
                            else:
                                compteur += 1
                                values = OrderedDict({
                                    "concept": {
                                        "type": polarite,
                                        "parents": graphItemIdValue + str(compt_Id - 1),
                                        "id": graphItemIdValue + str(compt_Id),
                                        "sentence": clean_concept,
                                        "source": output_file_name,
                                        "parameters": parameters_list,
                                        "image": urlImg,
                                        "pdf": urlPDF

                                    }

                                })

                        json_string_linked = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

                        output_result.append(json_string_linked)

        # Pass 6: near-duplicate elimination on the comma-free sentences.
        # NOTE(review): removes from the list while nested-iterating it, and
        # the `elif len(y) < len(s)` branch is unreachable given the first test.
        uniq_output_content = list(set(output_content))
        for s in uniq_output_content:
            for y in uniq_output_content:
                if s != y:
                    result = Levenshtein.ratio(s, y)
                    if result > .7:
                        if len(s) > len(y):
                            uniq_output_content.remove(y)
                        elif len(y) < len(s):
                            uniq_output_content.remove(s)

        # Pass 7: emit the remaining stand-alone concepts.
        for concept in uniq_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged if
                    pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id += 1
            sline = re.sub(r'^if ', '', concept)
            sline = re.sub(r'^(if|preferably) ', '', sline)
            sline = re.sub(r'^\s+?said ', '', sline)
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)

            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()

            parameters_list.extend(parameters)
            clean_concept = self.last_cleansing(sline)

            validity = self.discardLines(concept, 'referencing_indices')
            if polarite !='partialSolution' and validity is not None:

                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }

                })
                json_string = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                output_result.append(json_string)
        # Deduplicate the serialized concepts and join them for embedding in
        # a JSON array by the caller.
        output_result = list(set(output_result))

        output_json = ",".join(output_result)
        return output_json, total_sentences_number
|
App/bin/InformationExtractor_Claims.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from App.bin.FiguresCleaner import FiguresCleaner
|
2 |
+
from App.bin.ParameterExtractor import ParameterExtractor
|
3 |
+
from App.bin import constants
|
4 |
+
import nltk
|
5 |
+
import re
|
6 |
+
import os
|
7 |
+
|
8 |
+
import json
|
9 |
+
import hashlib
|
10 |
+
import Levenshtein
|
11 |
+
import uuid
|
12 |
+
from collections import OrderedDict
|
13 |
+
from App.bin.SharpClassifier import SharpClassifier
|
14 |
+
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
|
15 |
+
|
16 |
+
|
17 |
+
class InformationExtractorClaims(object):
    """Extract problem/solution concepts from the *claims* section of a patent.

    Mirrors ``InformationExtractor`` but works on claim language: sentences
    are normalized (``clean_data``), filtered by the ``claims_indices`` marker
    list, truncated of claim boilerplate (``truncate_data``), classified with
    ``ClassifyWithIncr_it`` and serialized as JSON concept objects.
    """

    def __init__(self, section, input_folder, file_extension, file_name):
        """Store the claims text and its provenance, and prime the tokenizer.

        Args:
            section: raw claims text.
            input_folder: corpus folder the patent file came from.
            file_extension: glob-style extension (e.g. "*.txt").
            file_name: patent publication number, e.g. "EP1234567A1".
        """
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name

        # Teach the Punkt splitter patent-specific abbreviations so it does
        # not break sentences on e.g. "fig." — fix: close the asset file.
        with open(constants.ASSETS + "abbreviation_sentence_splitter") as abbrev_file:
            patent_abbreviations = abbrev_file.read().split()
        sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_finder._params.abbrev_types.update(patent_abbreviations)
        self.sentence_finder = sentence_finder

    def clean_data(self, sentence):
        """Normalize a claim sentence.

        Lower-cases, drops empty parenthesis artifacts (reference-sign
        placeholders), spaces before commas, a leading claim number and
        redundant whitespace. Returns the cleaned sentence.
        """
        sentence = str(sentence.lower())
        sentence = re.sub(r'\(\s,?\s?\)', '', sentence)
        sentence = re.sub(r'\s+,', ',', sentence)
        sentence = re.sub(r'^\d+', '', sentence)
        sentence = re.sub(r'\s+', ' ', sentence)
        return sentence

    def truncate_data(self, sentence):
        """Strip claim boilerplate ("wherein", "characterized in that", ...)
        so only the technical statement remains. Returns the truncated text."""
        sentence = str(sentence.lower())
        sentence = re.sub(r'wherein said\s*', '', sentence)
        sentence = re.sub(r'characterized in that said\s*|characterised in that said?\s*', '', sentence)
        sentence = re.sub(r'wherein\s*', '', sentence)
        sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
        sentence = re.sub(r'characterized in that\s*', '', sentence)
        sentence = re.sub(r'where\s*', '', sentence)
        sentence = re.sub(r'where said\s*', '', sentence)
        sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
        sentence = re.sub(r'.*thereof\s*\,?', '', sentence)
        sentence = re.sub(r'^\s+', '', sentence)
        sentence = re.sub(r'\s+\.$', '', sentence)
        return sentence

    def selectLines(self, line, lexic):
        """Match *line* against the marker patterns in asset file *lexic*.

        Returns the first capture group of the first matching pattern, or
        None when no marker matches.
        """
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        claims_words = re.compile('|'.join(inclusion_list))
        m = re.search(claims_words, line)
        if m is not None:
            return m.group(1)
        return None

    def main(self):
        """Build JSON concept objects from the first claim concepts found.

        Returns:
            tuple: (comma-joined JSON concept strings, number of concepts).
        """
        output_result = []
        compt_Id = 50          # id suffix counter for emitted concepts
        count_concept = 3      # keep at most the first 3 matching claim lines

        clean_content_list = []
        concept_list = []

        output_content = []

        uniq_output_linked_content = []
        total_sentences_number = 0
        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")

        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        if file_name is not None:
            # Split a publication number like "EP1234567A1" into its parts.
            match = re.search('(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            # CC for country code
            CC = match.group(1)
            # NR for Number
            NR = match.group(2)
            NR = re.sub(r'\s', '', NR)
            # KC for Kind code
            KC = match.group(4)

            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'

        # Random node-id prefix (fix: dropped the dead md5 value that was
        # computed and immediately overwritten).
        graphItemIdValue = str(uuid.uuid4())

        sentence_finder = self.sentence_finder
        sentences = sentence_finder.tokenize(section.strip())
        for sentence in sentences:
            sentence = self.clean_data(sentence)
            if sentence != '':
                clean_content_list.append(sentence)
        for line in clean_content_list:
            if not re.match(r'^\s*$', line):
                line = self.selectLines(line, 'claims_indices')
                if line is not None and count_concept > 0:
                    line = self.truncate_data(line)
                    line = re.sub(r'in that', '', line)
                    concept_list.append(line)
                    count_concept -= 1

        count_concept = 3
        # Fix: was `if len(concept_list) is not None:` which is always true
        # (an int is never None) — test the list itself instead.
        if concept_list:
            total_sentences_number = len(concept_list)
            for concept in concept_list:

                if concept is not None and not re.match(r'^\s,', concept) and len(concept.split()) < 50:
                    # Fix: reset the parameter list per concept; previously it
                    # was shared, so each JSON object accumulated every
                    # preceding concept's parameters.
                    parameters_list = []
                    # Fix: advance the id counter per concept; previously all
                    # claim concepts shared the same "...50" suffix.
                    compt_Id += 1
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()

                    parameters_list.extend(parameters)

                    values = OrderedDict({
                        "concept": {
                            "type": polarite,
                            "id": graphItemIdValue + str(compt_Id),
                            "sentence": concept,
                            "source": output_file_name,
                            "parameters": parameters_list,
                            "image": urlImg,
                            "pdf": urlPDF

                        }

                    })
                    json_string = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                    output_result.append(json_string)
        output_result = list(set(output_result))

        output_json = ",".join(output_result)

        return output_json, total_sentences_number
|
163 |
+
|
164 |
+
|
165 |
+
|
App/bin/InputHandler.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
|
4 |
+
import glob
|
5 |
+
import os
|
6 |
+
|
7 |
+
|
8 |
+
class InputHandler(object):
    """Collects patent files matching a glob pattern from a folder tree."""

    def __init__(self, folder_path, extension):
        """Args:
            folder_path: root folder of the corpus.
            extension: glob pattern for patent files, e.g. "*.json".
        """
        self.folder_path = folder_path
        self.extension = extension

        print("Handling Corpus...")

    def _get_dirs(self, base):
        """Return the immediate sub-directories of *base* (full paths)."""
        return [x for x in glob.iglob(os.path.join(base, '*')) if os.path.isdir(x)]

    def get_base_file(self, base, pattern):
        """Recursively collect files under *base* matching *pattern*.

        Bug fix: the recursive call used ``os.path.join(base, d)`` although
        *d* (from ``_get_dirs``) is already a path that includes *base*; with
        a relative *base* this produced doubled paths like "base/base/sub"
        and silently found nothing in subdirectories. Recurse on *d* directly.
        """
        lList = []
        lList.extend(glob.glob(os.path.join(base, pattern)))
        for d in self._get_dirs(base):
            lList.extend(self.get_base_file(d, pattern))
        return lList

    def get_input(self):
        """Return all files under ``self.folder_path`` matching ``self.extension``."""
        folder_path = self.folder_path
        extension = self.extension
        patent_files = self.get_base_file(folder_path, extension)
        return patent_files
|
34 |
+
|
35 |
+
|
App/bin/MagicParser.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from App.bin import constants
|
3 |
+
|
4 |
+
|
5 |
+
class MagicParser(object):
    """Computes summary statistics over a problem-graph JSON file."""

    def __init__(self, jsonFile):
        # Path to the graph JSON produced by the corpus processors
        # (expects top-level "problem_graph" and "parameters" arrays).
        self.jsonFile = jsonFile

    def get_graph(self):
        """Load and return the parsed JSON graph from ``self.jsonFile``."""
        with open(self.jsonFile) as data_file:
            data = json.load(data_file)
        return data

    def magic_parse(self):
        """Count concepts, problems, partial solutions and parameters.

        Returns:
            dict with keys ``concepts_number``, ``problems_number``,
            ``partialSol_numbers``, ``parameters_number`` and
            ``uniq_param_number``.
        """
        count_problem = 0
        count_partial_solution = 0
        count_concepts = 0
        count_parameters = 0
        parameters = []
        # Bug fix: was `self.get_graph(self.json_file)` — get_graph() takes no
        # argument and the attribute is named `jsonFile`, so this line raised
        # AttributeError/TypeError on every call.
        graph = self.get_graph()

        for item in graph['problem_graph']:
            count_concepts += 1
            for sub_item, value in item.items():
                # Every concept that is not a partial solution counts as a problem.
                if value['type'] == 'partialSolution':
                    count_partial_solution += 1
                else:
                    count_problem += 1

        for item in graph['parameters']:
            for sub_item, value in item.items():
                for id, parameter in value['valeurs'].items():
                    parameters.append(parameter)
                    count_parameters += 1

        # Distinct parameter names across the whole graph.
        uniq_parameters_number = len(list(set(parameters)))

        return {"concepts_number": count_concepts, "problems_number": count_problem, "partialSol_numbers": count_partial_solution, "parameters_number": count_parameters, "uniq_param_number": uniq_parameters_number}
|
45 |
+
|
App/bin/PGProcessor.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import numpy as np
|
8 |
+
from io import StringIO
|
9 |
+
from App4api.bin import constants
|
10 |
+
from collections import OrderedDict
|
11 |
+
from App4api.bin.InformationExtractor import InformationExtractor
|
12 |
+
from App4api.bin.ParameterExtractor import ParameterExtractor
|
13 |
+
from App4api.bin.TechnologyFinder import TechnologyFinder
|
14 |
+
|
15 |
+
class PGProcessor(object):
    """Drives concept extraction over a list of patent JSON strings and
    writes the aggregated problem graph to ``<GRAPH_FOLDER>/<project>/graph.json``."""

    def __init__(self, patents,input_folder, file_extension):
        """Args:
            patents: iterable of patent JSON strings (each with "number",
                "abstract", "claims", "description", "source" keys).
            input_folder: corpus folder (its basename names the project).
            file_extension: glob-style extension of the source files.
        """
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")

    def process_corpus(self):
        """Extract concepts from every patent description, assemble the
        problem graph, persist it as JSON and print summary statistics.

        Returns:
            dict: the parsed problem-graph JSON ("problem_graph" key).
        """
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number =0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder+"/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        parameters_list =[]
        technologies_graph =[]

        for patent_file in patents:
            # Each entry is a JSON string; parse it from an in-memory stream.
            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']
            source = patent['source']

            # Prefer the description; fall back to claims, then abstract
            # (claims/abstract processing is not implemented yet).
            if dDescription !="":
                count_description +=1
                extract_concepts = InformationExtractor(dDescription,input_folder, file_extension, nNumber, source )
                output_json, total_sentences_number = extract_concepts.get_from_description()
                if output_json !="":
                    extracted_concepts.append(output_json)
                    # NOTE(review): this doubles the value just assigned above
                    # instead of accumulating across patents — presumably meant
                    # to add the per-patent count to a running total; confirm.
                    total_sentences_number += total_sentences_number
            elif cClaims !="":
                count_claims +=1
                print('Processing claims')
            else:
                count_abstract +=1
                print("processing abstract")
            count_patent +=1

            source_list.append(source)

        # Assemble the concept JSON fragments into one JSON document:
        # {"problem_graph": [<concept>, <concept>, ...]}
        header = '{'
        graph = '"problem_graph": [%s]' % ','.join(extracted_concepts)
        footer = '}'
        output_result.extend((header, graph, footer))
        output_result = "".join(output_result)
        concepts_json = json.loads(output_result)
        count_concepts = len(concepts_json['problem_graph'])
        # Walk concept objects and tally problems vs. partial solutions.
        for item, value in concepts_json.items():
            for element in value:
                for cle, valeur in element.items():
                    for k,v in valeur.items():
                        if k == "type" and v =="partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v =="problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        # Persist the aggregated graph for the project.
        with open(graph_folder+"graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)

        # Summary statistics (messages intentionally in French).
        print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
        print("%s phrases ont été analysée(s)" % (total_sentences_number))
        print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % (count_concepts, count_concepts_problem, count_concepts_solupart))

        # Colors kept for the (currently disabled) ratio chart below.
        first_color = (46, 204, 113)
        second_color = (245, 176, 65)
        #self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions'])
        return concepts_json
|
App/bin/ParamProcessor.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import numpy as np
|
8 |
+
from io import StringIO
|
9 |
+
from App4api.bin import constants
|
10 |
+
from collections import OrderedDict
|
11 |
+
from App4api.bin.InformationExtractor import InformationExtractor
|
12 |
+
from App4api.bin.ParameterExtractor import ParameterExtractor
|
13 |
+
from App4api.bin.TechnologyFinder import TechnologyFinder
|
14 |
+
|
15 |
+
class ParamProcessor(object):
    """Extract technical parameters from a corpus of patents and write the
    aggregate to ``<graph_folder>/parameters-graph.json``.

    Each item of *patents* is expected to be a JSON string with at least the
    keys ``number``, ``abstract``, ``claims``, ``description`` and ``source``.
    """

    def __init__(self, patents, input_folder, file_extension):
        # patents: iterable of per-patent JSON strings.
        # input_folder: used only to derive the project/graph folder name.
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")

    def change_keys(self, dictionnary, number):
        """Recursively prefix every key of *dictionnary* with ``<number>-``.

        Non-dict values are returned unchanged; nested dicts get the prefix
        re-applied at each level (so depth-2 keys carry a double dash).
        """
        number = number + '-'
        if type(dictionnary) is dict:
            return dict([(number + str(k), self.change_keys(v, number))
                         for k, v in dictionnary.items()])
        else:
            return dictionnary

    def process_corpus(self):
        """Run parameter extraction over every patent line by line.

        Returns the aggregated JSON object (``{"parameters": [...]}``) after
        writing it to ``parameters-graph.json`` in the project graph folder.
        """
        count_patent = 0
        patents = self.patents
        input_folder = self.input_folder
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        parameters_list = []

        for patent_file in patents:
            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']
            source = patent['source']

            # Scan every line of the whole patent text for parameters.
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()

            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend(parameters)

            # NOTE(review): parameters_list is never reset between patents, so
            # each patent's entry also contains all previous patents'
            # parameters. Kept as-is to preserve existing output — confirm
            # whether this accumulation is intentional.
            parameters_list = list(set(parameters_list))

            parameters = dict(enumerate(parameters_list, 1))
            parameters = self.change_keys(parameters, nNumber.lower())

            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters,
                }
            })
            # Fix: the original passed ``sort_keys=OrderedDict`` — a class
            # where json.dumps expects a bool. It only worked because the
            # class object is truthy; ``True`` is the equivalent, explicit
            # value and produces identical output.
            pParameters = json.dumps(parameters_array, sort_keys=True,
                                     indent=4, separators=(',', ': '))
            parameters_graph.append(pParameters)
            count_patent += 1
            source_list.append(source)
            patent_corpus.append(reduced_content)

        # Assemble the final document as text, then parse it back so the
        # per-patent JSON fragments are validated together.
        header = '{'
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        footer = '}'
        output_result.extend((header, parameters_output, footer))

        output_result = "".join(output_result)
        concepts_json = json.loads(output_result)

        json_write_to_file = json.dumps(concepts_json, sort_keys=False,
                                        indent=4, separators=(',', ': '))

        with open(graph_folder + "parameters-graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)

        return concepts_json
|
App/bin/ParameterExtractor.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
import re
|
4 |
+
import nltk
|
5 |
+
import Levenshtein
|
6 |
+
from App.bin import constants
|
7 |
+
|
8 |
+
class ParameterExtractor(object):
    """Pull candidate technical parameters out of a single sentence.

    A sentence qualifies only if it contains one of the seed words from the
    ``parameter_core`` asset; candidates are then chunked with a shallow
    noun-phrase grammar and filtered against ``exclude_from_parameters``.
    """

    def __init__(self, sentence):
        # Raw sentence to analyse.
        self.sentence = sentence

    def clean_parameter(self, parameter):
        """Drop a trailing isolated letter (e.g. a figure label) and trim
        surrounding whitespace from a candidate parameter phrase."""
        cleaned = re.sub(r'\s[a-zA-Z]$', r'', parameter)
        return cleaned.strip()

    def extract_parameters(self):
        """Return the deduplicated list of parameter phrases found in the
        sentence; empty when no seed word occurs."""
        text = self.sentence
        found = []

        # Seed words signalling that a parameter may be present.
        with open(constants.ASSETS + "parameter_core", 'r') as l:
            words_list = l.read().splitlines()
        match_word = re.compile(r'(\b(?:%s)\b)' % '|'.join(words_list))

        # Words that disqualify a candidate phrase outright.
        with open(constants.ASSETS + "exclude_from_parameters", 'r') as m:
            not_included_words_list = m.read().splitlines()
        match_not_included_word = re.compile(
            r'(\b(?:%s)\b)' % '|'.join(not_included_words_list))

        if re.search(match_word, text):
            # POS-tag the sentence and chunk noun phrases as PARAMETERs.
            tagged = nltk.pos_tag(nltk.word_tokenize(text))
            grammar = """PARAMETER:{<NN>+<IN><DT>?<NN.*>+}
                         {<NN*>+}
                         """
            chunker = nltk.RegexpParser(grammar)
            tree = chunker.parse(tagged)
            for subtree in tree.subtrees():
                if subtree.label() != 'PARAMETER':
                    continue
                candidate = " ".join(word for word, tag in subtree.leaves())
                has_seed = re.search(match_word, candidate)
                is_excluded = re.search(match_not_included_word, candidate)
                if has_seed and not is_excluded:
                    #candidate=self.clean_parameter(candidate)
                    found.append(candidate)
                # Dedupe on every hit, mirroring the original behaviour.
                found = list(set(found))

        return list(found)
|
51 |
+
|
App/bin/PatentHandler.py
ADDED
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
|
4 |
+
import glob
|
5 |
+
import nltk
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
import codecs
|
9 |
+
import chardet
|
10 |
+
import shutil
|
11 |
+
import json
|
12 |
+
from io import StringIO
|
13 |
+
from App.bin import constants
|
14 |
+
from App.bin.FiguresCleaner import FiguresCleaner
|
15 |
+
|
16 |
+
|
17 |
+
from collections import OrderedDict
|
18 |
+
|
19 |
+
class PatentHandler(object):
    """Normalise raw patent text files into clean per-section JSON records.

    Two entry points: ``pretreat_data`` reads section-delimited text files
    from disk, ``pretreat_json`` cleans already-structured dicts. Both feed
    each section through the same regex cleaning chain
    (custom_cleaner -> FiguresCleaner -> smooth_data_cleaner -> dataCleaner).
    """

    def __init__(self, patents):
        # patents: list of file paths (pretreat_data) or dicts (pretreat_json).
        self.patents = patents

    def custom_cleaner(self, line):
        """First-pass cleanup: strip PatentInspiration boilerplate, URLs,
        section headings and HTML entities; normalise Fig./FIGS. wording.

        NOTE: the substitutions are order-dependent — e.g. entity decoding
        must follow the brace/quote rewrites; do not reorder.
        """
        line = str(line)
        #line = line.lower()
        # Source-specific boilerplate and any URL.
        line = re.sub(r'PatentInspiration Url', '', line)
        line = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', line)
        # Braces and double quotes would clash with the JSON assembled later.
        line = re.sub(r'{', '(', line)
        line = re.sub(r'"', '\'', line)
        line = re.sub(r'}', ')', line)
        line = re.sub(r'\t.*patentinspiration.*\n', '', line)
        # Drop the standalone section headings.
        line = re.sub(r'^|\n{2,}\bAbstract\b\n?', '', line)
        line = re.sub(r'^|\n{2,}\bClaims\b\n?', '', line)
        line = re.sub(r'^|\n{2,}\bDescription\b\n?', '', line)
        # Expand figure abbreviations so the period is not taken for a
        # sentence boundary downstream.
        line = re.sub(r'fig\.', 'figure', line)
        line = re.sub(r'Fig\.', 'Figure', line)
        line = re.sub(r'FIG\.', 'Figure', line)
        line = re.sub(r'figs\.', 'figures', line)
        line = re.sub(r'FIGS\.', 'Figures', line)
        # Ensure a space after a word-final period.
        line = re.sub(r'(\w+\.)', r'\1 ', line)
        # Decode the HTML entities that survive extraction.
        line = re.sub(r''', '\'', line)
        line = re.sub(r'>', '>', line)
        line = re.sub(r'<', '<', line)
        line = re.sub(r'°', ' deg.', line)
        line = re.sub(r' ', ' ', line)
        line = line.strip()
        return line

    def dataCleaner(self, line):
        """Structural cleanup: break off all-caps headings and the phrases
        listed in the ``dropPart`` asset onto their own lines; drop stray
        ``. 123`` page-number artefacts."""
        with open(constants.ASSETS + "dropPart") as l:
            # next(l)
            drop_part = l.read().splitlines()
            # One alternation over every dropPart phrase.
            drop_part_pattern = re.compile('|'.join(drop_part))

        line = str(line)
        #line = line.lower()
        # A run of ALL-CAPS words at the start is treated as a heading.
        line = re.sub(r'^([A-Z-/]+\s)+([A-Z])', r'\n\2', line)
        line = re.sub(drop_part_pattern, r'\n', line)
        line = re.sub(r'\s+\.\s?\d+\s+', ' ', line)
        line = line.strip()
        return line

    def smooth_data_cleaner(self, line):
        """Punctuation/whitespace repair left over after figure-reference
        removal: empty parentheses, orphaned periods, numbered paragraph
        markers like ``[0012]``, and collapsed spacing.

        NOTE: order-dependent; later patterns assume earlier ones ran.
        """
        line = str(line)
        # line = line.lower()
        line = re.sub(r'\s+,', ',', line)
        # Figure-label ranges such as "1a-2b and 3c-4d".
        line = re.sub(r'\d\w-\d\w (and? \d\w-\d\w)?', '', line)
        line = re.sub(r'\d\w-\d\w', '', line)
        # Parentheses left holding only separators after reference removal.
        line = re.sub(r'\(\s?(,\s?|;\s?)+\s?\)', '', line)
        line = re.sub(r'\s+\.\s\.', '.\n', line)
        line = re.sub(r'\s+\.\s+([a-z]+)', r' \1', line)
        # Numbered-paragraph markers, with or without a preceding period.
        line = re.sub(r'\s+(\.)\s+\[\s?\d+\s?]\s+', r'.\n', line)
        line = re.sub(r'\s?\[\s?\d+\s?]\s+', r'\n', line)
        # A period followed by a capital starts a new line/sentence.
        line = re.sub(r'\s+(\.)\s+([A-Z]+)', r'.\n\2', line)
        line = re.sub(r'\s+;\s+', '; ', line)
        line = re.sub(r'\(\s+\'\s+\)', '', line)
        line = re.sub(r'\(\s+\)', '', line)
        line = re.sub(r'\(\s?\.\s?\)', '', line)
        line = re.sub(r'\(\s/\s?\)', '', line)
        line = re.sub(r'\s{2,}', ' ', line)
        # Re-join decimals split by spacing: "3 . 14" -> "3.14".
        line = re.sub(r'(\d+)\s+(\.)\s+(\d+)', r'\1.\3', line)
        line = line.strip()
        return line

    def get_project_folder(self):
        """Return the name of the directory containing the first patent file.

        Returns None implicitly when the patent list is empty.
        """
        patents = self.patents
        if patents:
            file = patents[0]
            project_folder = os.path.basename(os.path.dirname(file))
            return project_folder

    def convert_to_uf8(self, input_file_name, output_file_name, file_encoding):
        """Re-encode *input_file_name* (read as *file_encoding*) to UTF-8 at
        *output_file_name*, streaming in 1 MiB blocks.

        NOTE(review): method name has a typo ("uf8") — kept because callers
        use it.
        """
        BLOCKSIZE = 1048576
        with codecs.open(input_file_name, "r", file_encoding) as input_file:
            with codecs.open(output_file_name, "w", "utf-8") as output_file:
                while True:
                    file_contents = input_file.read(BLOCKSIZE)
                    if not file_contents:
                        break
                    output_file.write(file_contents)

    def sectionFinder(self, file_name, start_delimiter, end_delimiter):
        """Return the text between the line equal to *start_delimiter* and
        the line equal to *end_delimiter* (end line included; "" matches the
        file's blank/last lines, effectively reading to EOF).

        NOTE(review): the file handle is opened without ``with`` and never
        closed explicitly — relies on garbage collection.
        """
        patent_file = open(file_name, encoding='utf-8')
        section = ""
        found = False

        for line in patent_file:
            if found :
                section += line
                if line.strip() == end_delimiter:
                    break
            else:
                if line.strip() == start_delimiter:
                    found = True
        # abstract = "Abstract\n"
        return section

    def pretreat_data(self):
        """Read each patent file from disk, split it into Abstract / Claims /
        Description, clean every section and return a list of JSON strings.

        Side effects: recreates (wipes!) the corpus, temp and graph folders
        for the project, and writes a UTF-8 copy of each patent file.
        """
        clean_patent_data= []
        patents = self.patents

        project_folder = self.get_project_folder()

        # original code
        # corpus_folder = constants.CORPUS + project_folder + "/"

        corpus_folder = str(constants.CORPUS)+str(project_folder)+"/"
        temp_folder = str(constants.TEMP)+str(project_folder)+"/"
        graph_folder = str(constants.GRAPH_FOLDER)+str(project_folder)+"/"

        # Start from empty folders on every run.
        folders = [corpus_folder, temp_folder, graph_folder]
        for folder in folders:
            if not os.path.exists(folder):
                os.makedirs(folder)
            else:
                shutil.rmtree(folder)
                os.makedirs(folder)

        for patent in patents:

            patent_name_with_extension = os.path.basename(patent)
            # NOTE(review): breaks if the file name contains extra dots;
            # os.path.splitext would be safer — confirm naming convention.
            patent_name, extension= patent_name_with_extension.split('.')
            corpus_patent_path = corpus_folder + patent_name_with_extension
            #temp_patent_path = temp_folder + patent_name+'.json'

            # Detect the source encoding, then store a UTF-8 copy.
            patent_binary = open(patent, 'rb').read()

            file_encoding = chardet.detect(patent_binary)
            file_encoding = file_encoding['encoding']
            self.convert_to_uf8(patent,corpus_patent_path, file_encoding)

            temp_file = StringIO()
            #print(temp_patent_path)
            # Abstract: between the "Abstract" and "Claims" heading lines.
            a_abstract = self.sectionFinder(corpus_patent_path,"Abstract", "Claims")
            a_abstract= self.custom_cleaner(a_abstract)
            abstract_cleaner = FiguresCleaner(a_abstract)
            a_abstract = ''.join(abstract_cleaner.clean_figures())
            a_abstract = self.smooth_data_cleaner(a_abstract)
            a_abstract = self.dataCleaner(a_abstract)

            # Claims: from the "Claims" heading to end of file.
            c_claims = self.sectionFinder(corpus_patent_path, "Claims", "")
            c_claims = self.custom_cleaner(c_claims)
            claims_cleaner = FiguresCleaner(c_claims)
            c_claims = ''.join(claims_cleaner.clean_figures())
            c_claims = self.smooth_data_cleaner(c_claims)
            c_claims = self.smooth_data_cleaner(c_claims)

            d_description = self.sectionFinder(corpus_patent_path,"Description", "Claims")
            d_description = self.custom_cleaner(d_description)
            description_cleaner = FiguresCleaner(d_description)
            d_description = ''.join(description_cleaner.clean_figures())
            d_description = self.smooth_data_cleaner(d_description)
            d_description = self.dataCleaner(d_description)

            #TODO Manipulate data on system memory.

            data = {

                'number': patent_name,
                'abstract': a_abstract,
                'claims': c_claims,
                'description': d_description
            }

            json.dump(data, temp_file)
            clean_patent_data.append(temp_file.getvalue())
        return clean_patent_data


    def pretreat_json(self):
        """Clean patents that are already structured dicts (with ``filename``
        and ``number``) and return them as a list of cleaned dicts."""
        clean_patent_data= []
        patents = self.patents
        temp_file = StringIO()

        for patent in patents:
            # Round-trip through JSON to normalise the incoming object.
            patent = json.dumps(patent)

            read_patent_t = StringIO(patent)
            patent_section = json.load(read_patent_t)
            filename = patent_section['filename']
            number = patent_section['number']

            a_abstract = patent_section['abstract']
            a_abstract= self.custom_cleaner(a_abstract)
            abstract_cleaner = FiguresCleaner(a_abstract)
            a_abstract = ''.join(abstract_cleaner.clean_figures())
            a_abstract = self.smooth_data_cleaner(a_abstract)
            a_abstract = self.dataCleaner(a_abstract)

            c_claims = patent_section['claims']
            c_claims = self.custom_cleaner(c_claims)
            claims_cleaner = FiguresCleaner(c_claims)
            c_claims = ''.join(claims_cleaner.clean_figures())
            c_claims = self.smooth_data_cleaner(c_claims)
            c_claims = self.smooth_data_cleaner(c_claims)

            d_description = patent_section['description']
            d_description = self.custom_cleaner(d_description)
            description_cleaner = FiguresCleaner(d_description)
            d_description = ''.join(description_cleaner.clean_figures())
            d_description = self.smooth_data_cleaner(d_description)
            d_description = self.dataCleaner(d_description)

            #TODO Manipulate data on system memory.

            data = {
                'filename': filename,
                'number': number,
                'abstract': a_abstract,
                'claims': c_claims,
                'description': d_description
            }


            clean_patent_data.append(data)
            #json.dumps(clean_patent_data, temp_file)

        #print(json.dumps(clean_patent_data))
        return clean_patent_data
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
|
252 |
+
|
253 |
+
|
254 |
+
|
App/bin/SentenceClassifier.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
|
4 |
+
import nltk
|
5 |
+
from App.bin import constants
|
6 |
+
|
7 |
+
class SentenceClassifier(object):
    """Naive-Bayes polarity classifier for one sentence.

    The model is (re)trained on every call to :meth:`classifySentence` from
    the ``trainingsNegative`` / ``trainingsPositive`` asset files, where each
    line has the form ``text:label``.
    """

    def __init__(self, sentence):
        # Sentence whose polarity will be determined.
        self.sentence = sentence
        print("Classification....")

    def classifySentence(self):
        """Train a Naive-Bayes model from the asset files and return the
        predicted class label for this sentence."""
        sentence = self.sentence

        def collect_words(examples):
            # Flatten all token lists out of (tokens, label) pairs.
            flat = []
            for tokens, _label in examples:
                flat.extend(tokens)
            return flat

        def feature_vocabulary(tokens):
            # Distinct tokens, in frequency-distribution key order.
            return nltk.FreqDist(tokens).keys()

        def extract_Features(doc):
            # Boolean bag-of-words features over the training vocabulary.
            present = set(doc)
            return {'contains(%s)' % w: (w in present) for w in vocabulary}

        # Load labelled examples; each line splits into (text, label).
        with open(constants.ASSETS + "trainingsNegative") as l:
            problems = [tuple(map(str, i.strip().split(':'))) for i in l]
        with open(constants.ASSETS + "trainingsPositive") as f:
            solutions = [tuple(map(str, i.strip().split(':'))) for i in f]

        # Tokenise, lower-case and keep only words of length >= 3.
        labelled = []
        for words, polarity in solutions + problems:
            tokens = [e.lower() for e in nltk.word_tokenize(words) if len(e) >= 3]
            labelled.append((tokens, polarity))

        # Vocabulary must exist before extract_Features is ever called.
        vocabulary = feature_vocabulary(collect_words(labelled))

        training_set = nltk.classify.apply_features(extract_Features, labelled)

        classifier = nltk.NaiveBayesClassifier.train(training_set)

        #print(classifier.show_most_informative_features(32))

        #print (sentence)
        #print("{0} \n Polarity: {1} \n".format(sentence, classifier.classify(extract_Features(sentence.split()))))
        return classifier.classify(extract_Features(sentence.split()))
|