Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,14 @@ Address(
0.0
```

## 1.3 自定义地址设置
## 1.3 自定义地址文件设置

```kotlin
// 文件生成方式见下文
val geocoding = GeocodingX("region_2021.dat")
```

## 1.4 自定义地址设置

```kotlin
// 100000000000 代表中国的ID
Expand Down Expand Up @@ -128,20 +135,31 @@ Address(
# 2. 说明

## 2.1 标准地址库
项目目前采用的是 [淘宝物流4级地址][1] 的标准地址库, 也可以采用[国家的标准地址库][2] (对应的github库, [中国5级行政区域mysql库][3]).
项目目前采用的是 [淘宝物流4级地址][1] 的标准地址库, 也可以采用`国家的标准地址库` (对应的github库, [中国5级行政区域mysql库][3]).

* [国家标准地址库2015][2]
* [国家标准地址库2021](http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/)

### 导入中国5级行政区域mysql库注意事项

[参考文档](https://github.com/bitlap/geocoding/blob/master/src/test/java/org/bitlap/geocoding/region/README.md)

## 2.2 标准化
## 2.2 标准地址库(兼容本项目)

| 标准库文件 | 描述 | 参考 | 感谢 |
|----------------|-------------|-------------------------------------------------------------|--------------------------------------------------------------------------------------|
| region_2021.dat | 国家标准地址库2021 | [ISSUE-163](https://github.com/bitlap/geocoding/issues/163) | [TsLenMo](https://github.com/TsLenMo)、[weijiang.lin](https://github.com/linweijiang) |

使用方式:将标准库文件下载并放到项目的 `classpath` 下,然后用该文件的路径创建 `GeocodingX` 实例即可,例如 `GeocodingX("region_2021.dat")`。

## 2.3 标准化
1. 首先基于正则提取出道路、建筑物号等信息
2. 省市区等匹配
1. 将标准的地址库建立**倒排索引**
2. 将文本从起始位置开始, 采用**最大长度优先**的方式匹配所有词条
3. 对所有匹配结果进行标准行政区域从属关系校验

## 2.3 相似度计算
## 2.4 相似度计算
1. 对输入的两个地址进行标准化
2. 对省市区等信息分配不同的权重
3. 对道路号, 建筑号进行语义处理, 分配权重
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.bitlap</groupId>
<artifactId>geocoding</artifactId>
<version>1.2.0</version>
<version>1.3.0</version>

<name>geocoding</name>
<description>地理编码技术,提供地址标准化和相似度计算。</description>
Expand Down
67 changes: 26 additions & 41 deletions src/main/java/org/bitlap/geocoding/Geocoding.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package org.bitlap.geocoding;

import org.bitlap.geocoding.core.Context
import org.bitlap.geocoding.model.Address
import org.bitlap.geocoding.model.Address.Companion.build
import org.bitlap.geocoding.model.RegionEntity
import org.bitlap.geocoding.model.RegionType
import org.bitlap.geocoding.similarity.Document
Expand All @@ -16,62 +15,62 @@ import org.bitlap.geocoding.similarity.MatchedResult
*/
object Geocoding {

@JvmField
val DEFAULT = GeocodingX()

/**
* 地址的标准化, 将不规范的地址清洗成标准的地址格式
*/
@JvmStatic
fun normalizing(address: String): Address? {
return build(Context.getInterpreter().interpret(address))
return DEFAULT.normalizing(address)
}

/**
* 将地址进行切分
*/
@JvmStatic
fun analyze(address: String): Document? {
val addr = normalizing(address) ?: return null
return Context.getComputer().analyze(addr)
return DEFAULT.analyze(address)
}
@JvmStatic
fun analyze(address: Address?): Document? {
address ?: return null
return Context.getComputer().analyze(address)
return DEFAULT.analyze(address)
}

/**
* 地址的相似度计算
*/
@JvmStatic
fun similarity(addr1: String, addr2: String): Double {
val compute = Context.getComputer().compute(
normalizing(addr1),
normalizing(addr2)
)
return compute.similarity
fun similarity(address1: String, address2: String): Double {
return DEFAULT.similarity(address1, address2)
}
@JvmStatic
fun similarity(addr1: Address?, addr2: Address?): Double {
val compute = Context.getComputer().compute(addr1, addr2)
return compute.similarity
fun similarity(address1: Address?, address2: Address?): Double {
return DEFAULT.similarity(address1, address2)
}

/**
* 地址相似度计算, 包含匹配的所有结果
*/
@JvmStatic
fun similarityWithResult(addr1: String, addr2: String): MatchedResult {
return Context.getComputer().compute(
normalizing(addr1),
normalizing(addr2)
)
fun similarityWithResult(address1: String, address2: String): MatchedResult {
return DEFAULT.similarityWithResult(address1, address2)
}
@JvmStatic
fun similarityWithResult(addr1: Address?, addr2: Address?): MatchedResult {
return Context.getComputer().compute(addr1, addr2)
fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
return DEFAULT.similarityWithResult(address1, address2)
}

/**
* 深度优先匹配符合[text]的地址信息
*/
fun match(text: String): List<RegionEntity> {
return DEFAULT.match(text)
}

@JvmStatic
fun getContext(): Context = Context
fun getContext(): Context = DEFAULT.ctx

/**
* 设置自定义地址
Expand All @@ -81,25 +80,11 @@ object Geocoding {
* @param name 地址的名称
* @param type 地址类型, [RegionType]
* @param alias 地址的别名
* @param replace 是否替换旧地址, 当除了[id]之外的字段, 如果相等就替换
*/
@JvmStatic
fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "") {
val persister = getContext().getPersister()
persister.getRegion(parentId) ?: throw IllegalArgumentException("Parent Address is not exists, parentId is $parentId")
if (name.isBlank()) {
throw IllegalArgumentException("name should not be blank.")
}
// 构建 region 对象
val region = RegionEntity()
region.id = id
region.parentId = parentId
region.name = name
region.alias = alias
region.type = type
// 1. Add to cache (id -> Region)
persister.addRegionEntity(region)
// 2. Build term index
val indexBuilder = getContext().getInterpreter().getTermIndexBuilder()
indexBuilder.indexRegions(listOf(region))
fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true): Geocoding {
DEFAULT.addRegionEntry(id, parentId, name, type, alias, replace)
return this
}
}
111 changes: 111 additions & 0 deletions src/main/java/org/bitlap/geocoding/GeocodingX.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package org.bitlap.geocoding

import org.bitlap.geocoding.core.Context
import org.bitlap.geocoding.model.Address
import org.bitlap.geocoding.model.RegionEntity
import org.bitlap.geocoding.model.RegionType
import org.bitlap.geocoding.similarity.Document
import org.bitlap.geocoding.similarity.MatchedResult


/**
 * A geocoding entry point bound to one [Context].
 *
 * Every operation is served by the interpreter, similarity computer and persister exposed by
 * [ctx], which makes it possible to work with a custom region data file.
 *
 * @property ctx the context that supplies the interpreter, computer and persister used below.
 */
open class GeocodingX(val ctx: Context) {

    /** Uses the `core/region.dat` classpath resource with `strict = false`. */
    constructor() : this(false)

    /** Uses the `core/region.dat` classpath resource with the given [strict] mode. */
    constructor(strict: Boolean) : this("core/region.dat", strict)

    /** Uses the region data file at [dataClassPath] with `strict = false`. */
    constructor(dataClassPath: String) : this(dataClassPath, false)

    /**
     * Builds a [Context] from the given data file and parsing mode.
     *
     * The description of [strict] is translated from the original comment; the behaviour itself
     * is implemented behind [Context] and is not visible in this class.
     *
     * @param dataClassPath classpath location of the custom region data file.
     * @param strict parsing mode, `false` by default. When an address carries no province and
     *   city and exactly one parent candidate matches, the match succeeds.
     *   * `true`: strict mode — when there is no province and city and the number of matched
     *     parents is greater than 0, `null` is returned.
     *   * `false`: non-strict mode — when there is no province and city and the number of
     *     matched parents is greater than 0, one of the matched province/city candidates is
     *     picked at random.
     */
    constructor(dataClassPath: String, strict: Boolean) : this(Context(dataClassPath, strict))

    /**
     * Normalizes a free-form address into the standard [Address] format.
     *
     * @param address the raw address text.
     * @return the result of [Address.build] applied to the interpreter output; may be `null`.
     */
    fun normalizing(address: String): Address? {
        return Address.build(ctx.interpreter.interpret(address))
    }

    /**
     * Normalizes [address] and then splits it into a [Document].
     *
     * @return the analyzed document, or `null` when normalization yields `null`.
     */
    fun analyze(address: String): Document? = analyze(normalizing(address))

    /**
     * Splits an already normalized [address] into a [Document].
     *
     * @return the analyzed document, or `null` when [address] is `null`.
     */
    fun analyze(address: Address?): Document? {
        address ?: return null
        return ctx.computer.analyze(address)
    }

    /**
     * Computes the similarity of two raw addresses; both are normalized first.
     *
     * @return the `similarity` value of the computed [MatchedResult].
     */
    fun similarity(address1: String, address2: String): Double =
        similarityWithResult(address1, address2).similarity

    /**
     * Computes the similarity of two already normalized addresses.
     *
     * @return the `similarity` value of the computed [MatchedResult].
     */
    fun similarity(address1: Address?, address2: Address?): Double =
        similarityWithResult(address1, address2).similarity

    /**
     * Computes the similarity of two raw addresses and returns the complete match result.
     * Both inputs are normalized before they are handed to the computer.
     */
    fun similarityWithResult(address1: String, address2: String): MatchedResult =
        similarityWithResult(normalizing(address1), normalizing(address2))

    /**
     * Computes the similarity of two already normalized addresses and returns the complete
     * match result.
     */
    fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
        return ctx.computer.compute(address1, address2)
    }

    /**
     * Looks up the regions matching [text] (described as a depth-first match in the original
     * comment; the traversal lives in the term index builder, not here).
     *
     * @return the non-null region values of the matched terms; an empty list when the index
     *   reports no match.
     */
    fun match(text: String): List<RegionEntity> {
        val matchedTerms = ctx.interpreter.getTermIndexBuilder().fullMatch(text) ?: emptyList()
        return matchedTerms.mapNotNull { it.value }
    }

    /**
     * Registers a custom region: stores it in the persister and adds it to the term index.
     *
     * `@JvmOverloads` lets Java callers omit the trailing defaulted arguments.
     *
     * @param id ID of the new region.
     * @param parentId ID of the parent region; it must already exist.
     * @param name name of the region; must not be blank.
     * @param type region type, see [RegionType].
     * @param alias alias of the region.
     * @param replace forwarded to the index builder. Per the original comment it controls
     *   whether an existing entry is replaced when its fields other than [id] are equal —
     *   NOTE(review): confirm against `indexRegions`.
     * @throws IllegalArgumentException if no region with [parentId] exists or [name] is blank.
     */
    @JvmOverloads
    fun addRegionEntry(
        id: Long,
        parentId: Long,
        name: String,
        type: RegionType = RegionType.Undefined,
        alias: String = "",
        replace: Boolean = true,
    ) {
        // The parent is checked before the name, so the reported error is the same as before
        // when both arguments are invalid.
        requireNotNull(ctx.persister.getRegion(parentId)) {
            "Parent Address is not exists, parentId is $parentId"
        }
        require(name.isNotBlank()) { "name should not be blank." }

        val region = RegionEntity().apply {
            this.id = id
            this.parentId = parentId
            this.name = name
            this.alias = alias
            this.type = type
        }
        // 1. Add to the cache (id -> region).
        ctx.persister.addRegionEntity(region)
        // 2. Build the term index for the new region.
        ctx.interpreter.getTermIndexBuilder().indexRegions(listOf(region), replace)
    }
}
71 changes: 10 additions & 61 deletions src/main/java/org/bitlap/geocoding/core/Context.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package org.bitlap.geocoding.core

import org.bitlap.geocoding.core.impl.DefaultAddressInterpreter
import org.bitlap.geocoding.core.impl.DefaultAddressPersister
import org.bitlap.geocoding.core.impl.DefaultRegoinCache
import org.bitlap.geocoding.core.impl.DefaultRegionCache
import org.bitlap.geocoding.core.impl.RegionInterpreterVisitor
import org.bitlap.geocoding.core.impl.SimilarityComputer

Expand All @@ -12,66 +12,15 @@ import org.bitlap.geocoding.core.impl.SimilarityComputer
* Created by IceMimosa
* Date: 2017/1/12
*/
object Context {
open class Context(
val dataClassPath: String,
val strict: Boolean,
val persister: AddressPersister = DefaultAddressPersister(DefaultRegionCache(dataClassPath)),
val visitor: TermIndexVisitor = RegionInterpreterVisitor(persister, strict),
val interpreter: AddressInterpreter = DefaultAddressInterpreter(persister, visitor),
val computer: Computer = SimilarityComputer(),
) {

private var interpreter: AddressInterpreter? = null
private var persister: AddressPersister? = null
private var computer: Computer? = null

init {
// region entity默认, 此处暂时直接实例化
persister = DefaultAddressPersister(DefaultRegoinCache())
// 实例化
interpreter = DefaultAddressInterpreter()
// 计算类
computer = SimilarityComputer()
}

// 获取 AddressInterpreter
fun getInterpreter(): AddressInterpreter {
interpreter ?: throw IllegalArgumentException("[Context] -> 地址解析服务类初始化失败.")
return interpreter!!
}

// 获取 AddressPersister
fun getPersister(): AddressPersister {
persister ?: throw IllegalArgumentException("[Context] -> 地址持久化服务类初始化失败.")
return persister!!
}

// 获取 visitor
fun getVisitor(): TermIndexVisitor {
return RegionInterpreterVisitor(getPersister())
}

// 获取 计算类
fun getComputer(): Computer {
computer ?: throw IllegalArgumentException("[Context] -> 地址计算服务类初始化失败.")
return computer!!
}


///////////////////////
// Open API
///////////////////////


fun registInterpreter(interpreter: AddressInterpreter) {
synchronized(this) {
this.interpreter = interpreter
}
}

fun registPersister(persister: AddressPersister) {
synchronized(this) {
this.persister = persister
}
}

fun registComputer(computer: Computer) {
synchronized(this) {
this.computer = computer
}
}

}
}
Loading