|
|
--- |
|
|
license: mit |
|
|
datasets: |
|
|
- zerofata/Instruct-Anime |
|
|
- zerofata/Roleplay-Anime-Characters |
|
|
- zerofata/Instruct-Anime-CreativeWriting |
|
|
- zerofata/Summaries-Anime-FandomPages |
|
|
base_model: |
|
|
- zai-org/GLM-4.5-Air |
|
|
--- |
|
|
<style> |
|
|
.container { |
|
|
--primary-accent: #2B8CCC; |
|
|
--secondary-accent: #87CEEB; |
|
|
--glow-primary: rgba(43, 140, 204, 0.4); |
|
|
--glow-secondary: rgba(135, 206, 235, 0.6); |
|
|
|
|
|
--bg-main: #F8FEFF; |
|
|
--bg-container: #FFFFFF; |
|
|
--bg-card: rgba(240, 248, 255, 0.9); |
|
|
|
|
|
--text-main: #2C3E50; |
|
|
--text-muted: #546E7A; |
|
|
--white: #FFFFFF; |
|
|
--border-color: #B0E0E6; |
|
|
|
|
|
--font-title: 'Inter', sans-serif; |
|
|
--font-body: 'Source Sans Pro', sans-serif; |
|
|
--font-code: 'JetBrains Mono', monospace; |
|
|
|
|
|
font-family: var(--font-body); |
|
|
color: var(--text-main); |
|
|
line-height: 1.6; |
|
|
font-weight: 400; |
|
|
|
|
|
max-width: 1100px; |
|
|
margin: 20px auto; |
|
|
padding: 25px; |
|
|
background-color: var(--bg-main); |
|
|
background-image: |
|
|
linear-gradient(135deg, rgba(240, 248, 255, 0.9), rgba(255, 255, 255, 0.7)), |
|
|
radial-gradient(circle at 20% 80%, rgba(135, 206, 235, 0.1) 0%, transparent 50%), |
|
|
radial-gradient(circle at 80% 20%, rgba(176, 224, 230, 0.15) 0%, transparent 50%); |
|
|
min-height: calc(100vh - 40px); |
|
|
|
|
|
border-radius: 12px; |
|
|
box-shadow: 0 8px 32px rgba(43, 140, 204, 0.15), 0 2px 8px rgba(135, 206, 235, 0.1); |
|
|
border: 2px solid var(--border-color); |
|
|
} |
|
|
|
|
|
.container .title-container { |
|
|
background: linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(240, 248, 255, 0.9)); |
|
|
backdrop-filter: blur(10px); |
|
|
margin-bottom: 30px; |
|
|
border: 2px solid var(--border-color); |
|
|
border-radius: 16px; |
|
|
padding: 35px; |
|
|
text-align: center; |
|
|
position: relative; |
|
|
box-shadow: |
|
|
0 8px 32px rgba(43, 140, 204, 0.12), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 0.8); |
|
|
overflow: hidden; |
|
|
} |
|
|
|
|
|
/* FLAIR: Dense 24-Spoke Snowflake */ |
|
|
.container .title-container::before { |
|
|
content: ''; |
|
|
position: absolute; |
|
|
top: 50%; |
|
|
left: 50%; |
|
|
width: 350px; |
|
|
height: 350px; |
|
|
margin: -175px 0 0 -175px; |
|
|
background-image: |
|
|
radial-gradient(circle at center, transparent 35%, rgba(135, 206, 235, 0.25) 36%, transparent 37%), |
|
|
conic-gradient(from 0deg, |
|
|
rgba(176, 224, 230, 0.18) 0deg, |
|
|
transparent 7.5deg, |
|
|
rgba(135, 206, 235, 0.15) 15deg, |
|
|
transparent 22.5deg, |
|
|
rgba(176, 224, 230, 0.18) 30deg, |
|
|
transparent 37.5deg, |
|
|
rgba(135, 206, 235, 0.15) 45deg, |
|
|
transparent 52.5deg, |
|
|
rgba(176, 224, 230, 0.18) 60deg, |
|
|
transparent 67.5deg, |
|
|
rgba(135, 206, 235, 0.15) 75deg, |
|
|
transparent 82.5deg, |
|
|
rgba(176, 224, 230, 0.18) 90deg, |
|
|
transparent 97.5deg, |
|
|
rgba(135, 206, 235, 0.15) 105deg, |
|
|
transparent 112.5deg, |
|
|
rgba(176, 224, 230, 0.18) 120deg, |
|
|
transparent 127.5deg, |
|
|
rgba(135, 206, 235, 0.15) 135deg, |
|
|
transparent 142.5deg, |
|
|
rgba(176, 224, 230, 0.18) 150deg, |
|
|
transparent 157.5deg, |
|
|
rgba(135, 206, 235, 0.15) 165deg, |
|
|
transparent 172.5deg, |
|
|
rgba(176, 224, 230, 0.18) 180deg, |
|
|
transparent 187.5deg, |
|
|
rgba(135, 206, 235, 0.15) 195deg, |
|
|
transparent 202.5deg, |
|
|
rgba(176, 224, 230, 0.18) 210deg, |
|
|
transparent 217.5deg, |
|
|
rgba(135, 206, 235, 0.15) 225deg, |
|
|
transparent 232.5deg, |
|
|
rgba(176, 224, 230, 0.18) 240deg, |
|
|
transparent 247.5deg, |
|
|
rgba(135, 206, 235, 0.15) 255deg, |
|
|
transparent 262.5deg, |
|
|
rgba(176, 224, 230, 0.18) 270deg, |
|
|
transparent 277.5deg, |
|
|
rgba(135, 206, 235, 0.15) 285deg, |
|
|
transparent 292.5deg, |
|
|
rgba(176, 224, 230, 0.18) 300deg, |
|
|
transparent 307.5deg, |
|
|
rgba(135, 206, 235, 0.15) 315deg, |
|
|
transparent 322.5deg, |
|
|
rgba(176, 224, 230, 0.18) 330deg, |
|
|
transparent 337.5deg, |
|
|
rgba(135, 206, 235, 0.15) 345deg, |
|
|
transparent 352.5deg, |
|
|
rgba(176, 224, 230, 0.18) 360deg |
|
|
); |
|
|
mask: radial-gradient(circle at center, black 65%, transparent 75%); |
|
|
-webkit-mask: radial-gradient(circle at center, black 65%, transparent 75%); |
|
|
z-index: 1; |
|
|
pointer-events: none; |
|
|
} |
|
|
|
|
|
.container .title-container::after { |
|
|
content: ''; |
|
|
position: absolute; |
|
|
top: 50%; |
|
|
left: 50%; |
|
|
width: 180px; |
|
|
height: 180px; |
|
|
margin: -90px 0 0 -90px; |
|
|
background: conic-gradient(from 0deg, |
|
|
transparent 0deg, |
|
|
rgba(43, 140, 204, 0.12) 5deg, |
|
|
transparent 10deg, |
|
|
rgba(43, 140, 204, 0.12) 35deg, |
|
|
transparent 40deg, |
|
|
rgba(43, 140, 204, 0.12) 65deg, |
|
|
transparent 70deg, |
|
|
rgba(43, 140, 204, 0.12) 95deg, |
|
|
transparent 100deg, |
|
|
rgba(43, 140, 204, 0.12) 125deg, |
|
|
transparent 130deg, |
|
|
rgba(43, 140, 204, 0.12) 155deg, |
|
|
transparent 160deg, |
|
|
rgba(43, 140, 204, 0.12) 185deg, |
|
|
transparent 190deg, |
|
|
rgba(43, 140, 204, 0.12) 215deg, |
|
|
transparent 220deg, |
|
|
rgba(43, 140, 204, 0.12) 245deg, |
|
|
transparent 250deg, |
|
|
rgba(43, 140, 204, 0.12) 275deg, |
|
|
transparent 280deg, |
|
|
rgba(43, 140, 204, 0.12) 305deg, |
|
|
transparent 310deg, |
|
|
rgba(43, 140, 204, 0.12) 335deg, |
|
|
transparent 340deg |
|
|
); |
|
|
mask: radial-gradient(circle at center, transparent 25%, black 30%, black 40%, transparent 45%); |
|
|
-webkit-mask: radial-gradient(circle at center, transparent 25%, black 30%, black 40%, transparent 45%); |
|
|
z-index: 1; |
|
|
pointer-events: none; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
.container .title-container .title-wrapper { |
|
|
position: relative; |
|
|
z-index: 2; |
|
|
} |
|
|
|
|
|
.container .title-main { |
|
|
color: var(--text-main); |
|
|
font-size: 3.2rem; |
|
|
font-weight: 900; |
|
|
margin: 0; |
|
|
letter-spacing: 4px; |
|
|
display: block; |
|
|
text-transform: uppercase; |
|
|
background: linear-gradient(135deg, var(--primary-accent), var(--secondary-accent)); |
|
|
background-clip: text; |
|
|
-webkit-background-clip: text; |
|
|
-webkit-text-fill-color: transparent; |
|
|
font-family: var(--font-title); |
|
|
filter: drop-shadow(0 4px 8px rgba(43, 140, 204, 0.4)) drop-shadow(0 2px 4px rgba(255, 255, 255, 0.6)); |
|
|
text-shadow: |
|
|
0 0 20px rgba(255, 255, 255, 0.8), |
|
|
0 0 40px rgba(135, 206, 235, 0.6), |
|
|
0 4px 8px rgba(43, 140, 204, 0.3); |
|
|
position: relative; |
|
|
} |
|
|
|
|
|
.container .lemonade-text { |
|
|
background: linear-gradient(135deg, var(--secondary-accent), #B0E0E6); |
|
|
background-clip: text; |
|
|
-webkit-background-clip: text; |
|
|
-webkit-text-fill-color: transparent; |
|
|
filter: drop-shadow(0 2px 4px rgba(135, 206, 235, 0.3)); |
|
|
} |
|
|
|
|
|
.container .title-subtitle { |
|
|
padding-left: 0; |
|
|
margin-top: 15px; |
|
|
} |
|
|
|
|
|
.container .subtitle-text { |
|
|
color: var(--text-muted); |
|
|
font-size: 1.2rem; |
|
|
font-family: var(--font-body); |
|
|
font-style: italic; |
|
|
font-weight: 400; |
|
|
letter-spacing: 2px; |
|
|
text-transform: uppercase; |
|
|
opacity: 0.8; |
|
|
} |
|
|
|
|
|
.container img { |
|
|
max-width: 100%; |
|
|
border: 3px solid var(--border-color); |
|
|
margin-bottom: 40px; |
|
|
box-shadow: |
|
|
0 12px 24px rgba(43, 140, 204, 0.15), |
|
|
0 4px 8px rgba(135, 206, 235, 0.1); |
|
|
border-radius: 12px; |
|
|
} |
|
|
|
|
|
.container .section-container { |
|
|
margin-bottom: 30px; |
|
|
padding: 25px; |
|
|
background: rgba(255, 255, 255, 0.6); |
|
|
border: 1px solid var(--border-color); |
|
|
border-radius: 12px; |
|
|
box-shadow: 0 4px 16px rgba(43, 140, 204, 0.08); |
|
|
} |
|
|
.container .section-container:last-of-type { |
|
|
margin-bottom: 0; |
|
|
} |
|
|
|
|
|
.container .section-header { |
|
|
display: flex; |
|
|
align-items: center; |
|
|
padding: 0 0 20px 0; |
|
|
border-bottom: 2px solid var(--border-color); |
|
|
margin-bottom: 20px; |
|
|
} |
|
|
|
|
|
.container .section-title { |
|
|
font-family: var(--font-title); |
|
|
background: linear-gradient(45deg, var(--secondary-accent), var(--primary-accent)); |
|
|
background-clip: text; |
|
|
-webkit-background-clip: text; |
|
|
-webkit-text-fill-color: transparent; |
|
|
font-size: 1.4rem; |
|
|
margin: 0 !important; |
|
|
padding: 0 0 10px 0 !important; |
|
|
letter-spacing: 1px; |
|
|
font-weight: 700; |
|
|
text-transform: uppercase; |
|
|
border: none !important; |
|
|
position: relative; |
|
|
display: inline-block; |
|
|
} |
|
|
|
|
|
.container .section-title::after { |
|
|
content: ''; |
|
|
position: absolute; |
|
|
bottom: -2px; |
|
|
left: 0; |
|
|
width: 100%; |
|
|
height: 3px; |
|
|
background-image: linear-gradient(to right, var(--secondary-accent), var(--primary-accent)); |
|
|
border-radius: 2px; |
|
|
} |
|
|
|
|
|
.container .section-content { |
|
|
padding: 0; |
|
|
} |
|
|
|
|
|
.container .subheading { |
|
|
color: var(--primary-accent); |
|
|
font-size: 1.2rem; |
|
|
margin-top: 25px; |
|
|
margin-bottom: 15px; |
|
|
font-weight: 600; |
|
|
display: block; |
|
|
text-transform: uppercase; |
|
|
letter-spacing: 1px; |
|
|
font-family: var(--font-title); |
|
|
border-bottom: 2px solid var(--primary-accent); |
|
|
padding-bottom: 8px; |
|
|
} |
|
|
|
|
|
.container .data-box { |
|
|
background-color: var(--bg-card); |
|
|
padding: 20px; |
|
|
border: 2px solid var(--border-color); |
|
|
border-left: 4px solid var(--primary-accent); |
|
|
margin-bottom: 20px; |
|
|
box-shadow: 0 4px 12px rgba(43, 140, 204, 0.1); |
|
|
border-radius: 8px; |
|
|
font-size: 1rem; |
|
|
} |
|
|
|
|
|
.container .data-row { |
|
|
display: flex; |
|
|
align-items: center; |
|
|
margin-bottom: 6px; |
|
|
padding: 5px 0; |
|
|
} |
|
|
|
|
|
.container .data-row:last-child { |
|
|
margin-bottom: 0; |
|
|
} |
|
|
|
|
|
.container .data-arrow { |
|
|
color: var(--primary-accent); |
|
|
font-weight: bold; |
|
|
margin-right: 12px; |
|
|
font-family: var(--font-code); |
|
|
font-size: 1.1rem; |
|
|
} |
|
|
|
|
|
.container .data-label { |
|
|
color: var(--text-main); |
|
|
font-weight: 600; |
|
|
font-family: var(--font-body); |
|
|
margin-right: 10px; |
|
|
min-width: 90px; |
|
|
} |
|
|
|
|
|
.container a { |
|
|
color: var(--primary-accent); |
|
|
text-decoration: none; |
|
|
font-weight: 600; |
|
|
transition: all .2s; |
|
|
} |
|
|
|
|
|
.container .data-row a { |
|
|
border-bottom: 1px dotted var(--primary-accent); |
|
|
} |
|
|
|
|
|
.container a:hover { |
|
|
text-decoration: none; |
|
|
color: var(--secondary-accent); |
|
|
transform: translateY(-1px); |
|
|
} |
|
|
|
|
|
.container .data-row a:hover { |
|
|
border-bottom-style: solid; |
|
|
} |
|
|
|
|
|
.container .dropdown-container { |
|
|
margin-top: 20px; |
|
|
} |
|
|
|
|
|
.container .dropdown-summary { |
|
|
cursor: pointer; |
|
|
padding: 10px 0; |
|
|
color: var(--text-muted); |
|
|
font-size: 1.1rem; |
|
|
font-weight: 700; |
|
|
text-transform: none; |
|
|
font-family: var(--font-title); |
|
|
letter-spacing: 1px; |
|
|
list-style: none; |
|
|
transition: color 0.2s ease; |
|
|
} |
|
|
.container .dropdown-summary:hover { |
|
|
color: var(--primary-accent); |
|
|
} |
|
|
|
|
|
.container .dropdown-arrow { |
|
|
color: var(--secondary-accent); |
|
|
margin-right: 10px; |
|
|
transition: transform 0.2s ease; |
|
|
} |
|
|
|
|
|
.container .dropdown-content { |
|
|
margin-top: 15px; |
|
|
padding: 25px; |
|
|
background-color: var(--bg-card); |
|
|
border: 2px solid var(--border-color); |
|
|
border-radius: 8px; |
|
|
box-shadow: 0 4px 12px rgba(43, 140, 204, 0.1); |
|
|
} |
|
|
|
|
|
.container .config-title { |
|
|
color: var(--text-muted); |
|
|
font-size: 1rem; |
|
|
margin-bottom: 10px; |
|
|
font-family: var(--font-body); |
|
|
text-transform: uppercase; |
|
|
letter-spacing: 1px; |
|
|
font-weight: 700; |
|
|
} |
|
|
|
|
|
.container pre { |
|
|
background-color: #f8f9fa; |
|
|
padding: 20px; |
|
|
border: 2px solid var(--border-color); |
|
|
white-space: pre-wrap; |
|
|
word-wrap: break-word; |
|
|
color: var(--text-main); |
|
|
border-radius: 8px; |
|
|
box-shadow: inset 0 2px 4px rgba(43, 140, 204, 0.1); |
|
|
} |
|
|
|
|
|
.container pre code { |
|
|
background: none; |
|
|
color: inherit; |
|
|
padding: 0; |
|
|
border-radius: 0; |
|
|
} |
|
|
|
|
|
.container code { |
|
|
font-family: var(--font-code); |
|
|
color: var(--primary-accent); |
|
|
background: rgba(176, 224, 230, 0.2); |
|
|
padding: 3px 6px; |
|
|
border-radius: 4px; |
|
|
} |
|
|
</style> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="UTF-8"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
|
<title>Iceblink</title> |
|
|
<link rel="preconnect" href="https://fonts.googleapis.com"> |
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
|
|
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700;800;900&family=Source+Sans+Pro:ital,wght@0,400;0,600;0,700;1,400&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet"> |
|
|
</head> |
|
|
<body> |
|
|
|
|
|
<div class="container"> |
|
|
<div class="title-container"> |
|
|
<div class="glitchy-overlay"></div> |
|
|
<div class="title-wrapper"> |
|
|
<h1 class="title-main"> |
|
|
<span class="title-prefix">ICEBLINK</span> |
|
|
</h1> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
 |
|
|
|
|
|
<div class="section-container"> |
|
|
<div class="section-header"> |
|
|
<div class="section-indicator"></div> |
|
|
<h2 class="section-title">Overview</h2> |
|
|
</div> |
|
|
<div class="section-content"> |
|
|
<p>An experimental GLM4.5 Air finetune.</p> |
|
|
<p>Had this one in the works for a while, but was struggling to find the right hyperparams to get this model to behave nicely. Thank you to TheDrummer for helping me out with them.</p> |
|
|
<p>This model is a creative writing and RP model. It's pretty verbose. The intent is to keep the behavior of the original model, but to slightly improve writing, dialogue &amp; creativity.</p> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<div class="section-container"> |
|
|
<div class="section-header"> |
|
|
<div class="section-indicator"></div> |
|
|
<h2 class="section-title">SillyTavern Settings</h2> |
|
|
</div> |
|
|
<div class="section-content"> |
|
|
<h3 class="subheading">Recommended Roleplay Format</h3> |
|
|
<div class="data-box"> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<span class="data-label">Actions:</span> |
|
|
<span>In plaintext</span> |
|
|
</div> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<span class="data-label">Dialogue:</span> |
|
|
<span>"In quotes"</span> |
|
|
</div> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<span class="data-label">Thoughts:</span> |
|
|
<span>*In asterisks*</span> |
|
|
</div> |
|
|
</div> |
|
|
<h3 class="subheading">Recommended Samplers</h3> |
|
|
<div class="data-box"> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<span class="data-label">Temp:</span> |
|
|
<span>0.8</span> |
|
|
</div> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<span class="data-label">MinP:</span> |
|
|
<span>0.05</span> |
|
|
</div> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<span class="data-label">TopP:</span> |
|
|
<span>0.95</span> |
|
|
</div> |
|
|
</div> |
|
|
<h3 class="subheading">Instruct</h3> |
|
|
<div class="data-box"> |
|
|
<p style="margin: 0;">GLM4.5 (no thinking): <a href="https://huggingface.co/zerofata/GLM-4.5-Iceblink-106B-A12B/raw/main/GLM45-NoThink-SillyTavern-Preset.json">SillyTavern Preset</a></p> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<div class="section-container"> |
|
|
<div class="section-header"> |
|
|
<div class="section-indicator"></div> |
|
|
<h2 class="section-title">Quantizations</h2> |
|
|
</div> |
|
|
<div class="section-content"> |
|
|
<div style="margin-bottom: 20px;"> |
|
|
<h3 class="subheading">GGUF</h3> |
|
|
<div class="data-box"> |
|
|
<div class="data-row"> |
|
|
<span class="data-arrow">></span> |
|
|
<a href="https://huggingface.co/bartowski/zerofata_GLM-4.5-Iceblink-106B-A12B-GGUF">iMatrix (bartowski)</a> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<div class="section-container"> |
|
|
<div class="section-header"> |
|
|
<div class="section-indicator"></div> |
|
|
<h2 class="section-title">Creation Process</h2> |
|
|
</div> |
|
|
<div class="section-content"> |
|
|
<p>Creation Process: SFT</p> |
|
|
<p>SFT on approx 10 million tokens, SFW / NSFW RP, stories, creative instruct &amp; chat data.</p> |
|
|
<p>MoE models are brutal to train even with a small dataset like mine, so I took a different approach from usual. I used a very low LR in an effort to avoid having to apply DPO / KTO training afterwards.</p> |
|
|
<p>I think there's likely a better config to be found, but experimentation with the model to find it is quite draining.</p> |
|
|
<div class="dropdown-container"> |
|
|
<details> |
|
|
<summary class="dropdown-summary"> |
|
|
<span class="dropdown-arrow">></span> |
|
|
Axolotl configs |
|
|
</summary> |
|
|
<div class="dropdown-content"> |
|
|
<p>Not optimized for cost / performance efficiency, YMMV.</p> |
|
|
<div class="config-title">SFT (4*H200)</div> |
|
|
<pre><code>base_model: zai-org/GLM-4.5-Air |
|
|
eot_tokens: |
|
|
- "&lt;|user|&gt;" |
|
|
- "&lt;|endoftext|&gt;" |
|
|
special_tokens: |
|
|
eos_token: "&lt;|user|&gt;" |
|
|
<br> |
|
|
# ==================== |
|
|
# DATASET CONFIGURATION |
|
|
# ==================== |
|
|
datasets: |
|
|
- path: ./data/dataset.jsonl |
|
|
type: chat_template |
|
|
split: train |
|
|
field_messages: messages |
|
|
message_property_mappings: |
|
|
role: role |
|
|
content: content |
|
|
roles: |
|
|
user: ["user"] |
|
|
assistant: ["assistant"] |
|
|
system: ["system"] |
|
|
<br> |
|
|
dataset_prepared_path: ./last_run_prepared |
|
|
train_on_inputs: false # Only train on assistant responses |
|
|
eval_sample_packing: False |
|
|
<br> |
|
|
# ==================== |
|
|
# QLORA CONFIGURATION |
|
|
# ==================== |
|
|
adapter: qlora |
|
|
load_in_4bit: true |
|
|
lora_r: 32 |
|
|
lora_alpha: 32 |
|
|
lora_dropout: 0.1 |
|
|
lora_target_modules: |
|
|
- gate_proj |
|
|
- down_proj |
|
|
- up_proj |
|
|
- q_proj |
|
|
- v_proj |
|
|
- k_proj |
|
|
- o_proj |
|
|
# lora_modules_to_save: # Uncomment only if you added NEW tokens |
|
|
<br> |
|
|
# ==================== |
|
|
# TRAINING PARAMETERS |
|
|
# ==================== |
|
|
num_epochs: 3 |
|
|
micro_batch_size: 2 |
|
|
gradient_accumulation_steps: 4 |
|
|
learning_rate: 4.5e-6 |
|
|
optimizer: paged_adamw_8bit |
|
|
lr_scheduler: rex |
|
|
warmup_ratio: 0.05 |
|
|
weight_decay: 0.01 |
|
|
max_grad_norm: 1.0 |
|
|
val_set_size: 0.02 |
|
|
<br> |
|
|
# ==================== |
|
|
# SEQUENCE & PACKING |
|
|
# ==================== |
|
|
sequence_len: 8192 |
|
|
sample_packing: true |
|
|
pad_to_sequence_len: true |
|
|
<br> |
|
|
# ==================== |
|
|
# HARDWARE OPTIMIZATIONS |
|
|
# ==================== |
|
|
bf16: auto |
|
|
flash_attention: true |
|
|
gradient_checkpointing: true |
|
|
<br> |
|
|
plugins: |
|
|
- axolotl.integrations.liger.LigerPlugin |
|
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin |
|
|
liger_rope: false |
|
|
liger_rms_norm: true |
|
|
liger_layer_norm: true |
|
|
liger_glu_activation: true |
|
|
liger_fused_linear_cross_entropy: true |
|
|
cut_cross_entropy: false |
|
|
<br> |
|
|
deepspeed: deepspeed_configs/zero1.json |
|
|
<br> |
|
|
# ==================== |
|
|
# EVALUATION & CHECKPOINTING |
|
|
# ==================== |
|
|
save_strategy: steps |
|
|
save_steps: 20 |
|
|
eval_steps: 35 |
|
|
save_total_limit: 18 # Keep best + last few checkpoints |
|
|
load_best_model_at_end: true |
|
|
metric_for_best_model: eval_loss |
|
|
greater_is_better: false |
|
|
<br> |
|
|
# ==================== |
|
|
# LOGGING & OUTPUT |
|
|
# ==================== |
|
|
output_dir: ./GLM-AIR-SFT_v2-5 |
|
|
logging_steps: 1 |
|
|
save_safetensors: true |
|
|
<br> |
|
|
# ==================== |
|
|
# WANDB TRACKING |
|
|
# ==================== |
|
|
wandb_project: GLM-AIR-SFT |
|
|
# wandb_entity: your_entity |
|
|
wandb_name: GLM-AIR-SFT_v2-5</code></pre> |
|
|
</div> |
|
|
</details> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
</body> |
|
|
</html> |